5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
15 //**********************************************************************************************************************
16 vector<string> TrimSeqsCommand::setParameters(){
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
19 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
20 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
21 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
22 CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
23 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
24 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
25 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
26 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
27 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
28 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
29 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
30 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
31 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
32 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
33 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
34 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
35 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
36 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
37 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
38 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
39 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
40 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
41 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
42 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
43 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
44 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
45 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
46 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
48 vector<string> myArray;
49 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
53 m->errorOut(e, "TrimSeqsCommand", "setParameters");
57 //**********************************************************************************************************************
58 string TrimSeqsCommand::getHelpString(){
60 string helpString = "";
61 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
62 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
63 helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
64 helpString += "The fasta parameter is required.\n";
65 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
66 helpString += "The oligos parameter allows you to provide an oligos file.\n";
67 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
68 helpString += "The count parameter allows you to provide a count file with your fasta file.\n";
69 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
70 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
71 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
72 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
73 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
74 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
75 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
76 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
77 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
78 helpString += "The qfile parameter allows you to provide a quality file.\n";
79 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
80 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
81 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
82 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
83 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
84 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
85 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
86 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
87 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
88 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
89 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
90 helpString += "The trim.seqs command should be in the following format: \n";
91 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
92 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
93 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
94 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
95 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
99 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
103 //**********************************************************************************************************************
104 string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
106 string outputFileName = "";
107 map<string, vector<string> >::iterator it;
109 //is this a type this command creates
110 it = outputTypes.find(type);
111 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
113 if (type == "qfile") { outputFileName = "qual"; }
114 else if (type == "fasta") { outputFileName = "fasta"; }
115 else if (type == "group") { outputFileName = "groups"; }
116 else if (type == "name") { outputFileName = "names"; }
117 else if (type == "count") { outputFileName = "count_table"; }
118 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
120 return outputFileName;
122 catch(exception& e) {
123 m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag");
129 //**********************************************************************************************************************
131 TrimSeqsCommand::TrimSeqsCommand(){
133 abort = true; calledHelp = true;
135 vector<string> tempOutNames;
136 outputTypes["fasta"] = tempOutNames;
137 outputTypes["qfile"] = tempOutNames;
138 outputTypes["group"] = tempOutNames;
139 outputTypes["name"] = tempOutNames;
140 outputTypes["count"] = tempOutNames;
142 catch(exception& e) {
143 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
147 //***************************************************************************************************************
149 TrimSeqsCommand::TrimSeqsCommand(string option) {
152 abort = false; calledHelp = false;
155 //allow user to run help
156 if(option == "help") { help(); abort = true; calledHelp = true; }
157 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
160 vector<string> myArray = setParameters();
162 OptionParser parser(option);
163 map<string,string> parameters = parser.getParameters();
165 ValidParameters validParameter;
166 map<string,string>::iterator it;
168 //check to make sure all parameters are valid for command
169 for (it = parameters.begin(); it != parameters.end(); it++) {
170 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
173 //initialize outputTypes
174 vector<string> tempOutNames;
175 outputTypes["fasta"] = tempOutNames;
176 outputTypes["qfile"] = tempOutNames;
177 outputTypes["group"] = tempOutNames;
178 outputTypes["name"] = tempOutNames;
179 outputTypes["count"] = tempOutNames;
181 //if the user changes the input directory command factory will send this info to us in the output parameter
182 string inputDir = validParameter.validFile(parameters, "inputdir", false);
183 if (inputDir == "not found"){ inputDir = ""; }
186 it = parameters.find("fasta");
187 //user has given a template file
188 if(it != parameters.end()){
189 path = m->hasPath(it->second);
190 //if the user has not given a path then, add inputdir. else leave path alone.
191 if (path == "") { parameters["fasta"] = inputDir + it->second; }
194 it = parameters.find("oligos");
195 //user has given a template file
196 if(it != parameters.end()){
197 path = m->hasPath(it->second);
198 //if the user has not given a path then, add inputdir. else leave path alone.
199 if (path == "") { parameters["oligos"] = inputDir + it->second; }
202 it = parameters.find("qfile");
203 //user has given a template file
204 if(it != parameters.end()){
205 path = m->hasPath(it->second);
206 //if the user has not given a path then, add inputdir. else leave path alone.
207 if (path == "") { parameters["qfile"] = inputDir + it->second; }
210 it = parameters.find("name");
211 //user has given a template file
212 if(it != parameters.end()){
213 path = m->hasPath(it->second);
214 //if the user has not given a path then, add inputdir. else leave path alone.
215 if (path == "") { parameters["name"] = inputDir + it->second; }
218 it = parameters.find("count");
219 //user has given a template file
220 if(it != parameters.end()){
221 path = m->hasPath(it->second);
222 //if the user has not given a path then, add inputdir. else leave path alone.
223 if (path == "") { parameters["count"] = inputDir + it->second; }
229 //check for required parameters
230 fastaFile = validParameter.validFile(parameters, "fasta", true);
231 if (fastaFile == "not found") {
232 fastaFile = m->getFastaFile();
233 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
234 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
235 }else if (fastaFile == "not open") { abort = true; }
236 else { m->setFastaFile(fastaFile); }
238 //if the user changes the output directory command factory will send this info to us in the output parameter
239 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
241 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
245 //check for optional parameter and set defaults
246 // ...at some point should added some additional type checking...
248 temp = validParameter.validFile(parameters, "flip", false);
249 if (temp == "not found") { flip = 0; }
250 else { flip = m->isTrue(temp); }
252 temp = validParameter.validFile(parameters, "oligos", true);
253 if (temp == "not found"){ oligoFile = ""; }
254 else if(temp == "not open"){ abort = true; }
255 else { oligoFile = temp; m->setOligosFile(oligoFile); }
258 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
259 m->mothurConvert(temp, maxAmbig);
261 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
262 m->mothurConvert(temp, maxHomoP);
264 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
265 m->mothurConvert(temp, minLength);
267 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
268 m->mothurConvert(temp, maxLength);
270 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
271 m->mothurConvert(temp, bdiffs);
273 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
274 m->mothurConvert(temp, pdiffs);
276 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
277 m->mothurConvert(temp, ldiffs);
279 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
280 m->mothurConvert(temp, sdiffs);
282 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
283 m->mothurConvert(temp, tdiffs);
285 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
287 temp = validParameter.validFile(parameters, "qfile", true);
288 if (temp == "not found") { qFileName = ""; }
289 else if(temp == "not open") { abort = true; }
290 else { qFileName = temp; m->setQualFile(qFileName); }
292 temp = validParameter.validFile(parameters, "name", true);
293 if (temp == "not found") { nameFile = ""; }
294 else if(temp == "not open") { nameFile = ""; abort = true; }
295 else { nameFile = temp; m->setNameFile(nameFile); }
297 countfile = validParameter.validFile(parameters, "count", true);
298 if (countfile == "not open") { abort = true; countfile = ""; }
299 else if (countfile == "not found") { countfile = ""; }
300 else { m->setCountTableFile(countfile); }
302 if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
304 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
305 m->mothurConvert(temp, qThreshold);
307 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
308 qtrim = m->isTrue(temp);
310 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
311 convert(temp, qRollAverage);
313 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
314 convert(temp, qWindowAverage);
316 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
317 convert(temp, qWindowSize);
319 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
320 convert(temp, qWindowStep);
322 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
323 convert(temp, qAverage);
325 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
326 convert(temp, keepFirst);
328 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
329 convert(temp, removeLast);
331 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
332 allFiles = m->isTrue(temp);
334 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
335 keepforward = m->isTrue(temp);
337 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
338 m->setProcessors(temp);
339 m->mothurConvert(temp, processors);
342 if(allFiles && (oligoFile == "")){
343 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
345 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
346 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
350 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
351 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
355 if (countfile == "") {
356 if (nameFile == "") {
357 vector<string> files; files.push_back(fastaFile);
358 parser.getNameFile(files);
364 catch(exception& e) {
365 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
369 //***************************************************************************************************************
371 int TrimSeqsCommand::execute(){
374 if (abort == true) { if (calledHelp) { return 0; } return 2; }
376 numFPrimers = 0; //this needs to be initialized
381 vector<vector<string> > fastaFileNames;
382 vector<vector<string> > qualFileNames;
383 vector<vector<string> > nameFileNames;
385 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta");
386 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
388 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta");
389 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
391 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile");
392 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile");
394 if (qFileName != "") {
395 outputNames.push_back(trimQualFile);
396 outputNames.push_back(scrapQualFile);
397 outputTypes["qfile"].push_back(trimQualFile);
398 outputTypes["qfile"].push_back(scrapQualFile);
401 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name");
402 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name");
404 if (nameFile != "") {
405 m->readNames(nameFile, nameMap);
406 outputNames.push_back(trimNameFile);
407 outputNames.push_back(scrapNameFile);
408 outputTypes["name"].push_back(trimNameFile);
409 outputTypes["name"].push_back(scrapNameFile);
412 string trimCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "trim." + getOutputFileNameTag("count");
413 string scrapCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "scrap." + getOutputFileNameTag("count");
415 if (countfile != "") {
417 ct.readTable(countfile);
418 nameCount = ct.getNameMap();
419 outputNames.push_back(trimCountFile);
420 outputNames.push_back(scrapCountFile);
421 outputTypes["count"].push_back(trimCountFile);
422 outputTypes["count"].push_back(scrapCountFile);
426 if (m->control_pressed) { return 0; }
428 string outputGroupFileName;
430 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
431 if ((createGroup) && (countfile == "")){
432 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group");
433 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
437 //fills lines and qlines
438 setLines(fastaFile, qFileName);
441 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
443 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
447 if (m->control_pressed) { return 0; }
450 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
451 map<string, string>::iterator it;
452 set<string> namesToRemove;
453 for(int i=0;i<fastaFileNames.size();i++){
454 for(int j=0;j<fastaFileNames[0].size();j++){
455 if (fastaFileNames[i][j] != "") {
456 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
457 if(m->isBlank(fastaFileNames[i][j])){
458 m->mothurRemove(fastaFileNames[i][j]);
459 namesToRemove.insert(fastaFileNames[i][j]);
462 m->mothurRemove(qualFileNames[i][j]);
463 namesToRemove.insert(qualFileNames[i][j]);
467 m->mothurRemove(nameFileNames[i][j]);
468 namesToRemove.insert(nameFileNames[i][j]);
471 it = uniqueFastaNames.find(fastaFileNames[i][j]);
472 if (it == uniqueFastaNames.end()) {
473 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
481 //remove names for outputFileNames, just cleans up the output
482 vector<string> outputNames2;
483 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
484 outputNames = outputNames2;
486 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
488 m->openInputFile(it->first, in);
491 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first));
492 if (countfile == "") { thisGroupName += getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
493 else { thisGroupName += getOutputFileNameTag("count"); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
494 m->openOutputFile(thisGroupName, out);
496 if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; }
499 if (m->control_pressed) { break; }
501 Sequence currSeq(in); m->gobble(in);
502 if (countfile == "") {
503 out << currSeq.getName() << '\t' << it->second << endl;
505 if (nameFile != "") {
506 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
507 if (itName != nameMap.end()) {
508 vector<string> thisSeqsNames;
509 m->splitAtChar(itName->second, thisSeqsNames, ',');
510 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
511 out << thisSeqsNames[k] << '\t' << it->second << endl;
513 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
516 map<string, int>::iterator itTotalReps = nameCount.find(currSeq.getName());
517 if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; }
518 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
525 if (countfile != "") { //create countfile with group info included
526 CountTable* ct = new CountTable();
527 ct->readTable(trimCountFile);
528 map<string, int> justTrimmedNames = ct->getNameMap();
532 for (map<string, int>::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); }
533 vector<int> tempCounts; tempCounts.resize(groupCounts.size(), 0);
534 for (map<string, int>::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) {
535 newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance
536 map<string, string>::iterator it2 = groupMap.find(itNames->first);
537 if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); }
538 else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; }
540 newCt.printTable(trimCountFile);
544 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
546 //output group counts
547 m->mothurOutEndLine();
549 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
550 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
551 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
553 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
555 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
557 //set fasta file as new current fastafile
559 itTypes = outputTypes.find("fasta");
560 if (itTypes != outputTypes.end()) {
561 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
564 itTypes = outputTypes.find("name");
565 if (itTypes != outputTypes.end()) {
566 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
569 itTypes = outputTypes.find("qfile");
570 if (itTypes != outputTypes.end()) {
571 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
574 itTypes = outputTypes.find("group");
575 if (itTypes != outputTypes.end()) {
576 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
579 itTypes = outputTypes.find("count");
580 if (itTypes != outputTypes.end()) {
581 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
584 m->mothurOutEndLine();
585 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
586 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
587 m->mothurOutEndLine();
592 catch(exception& e) {
593 m->errorOut(e, "TrimSeqsCommand", "execute");
598 /**************************************************************************************/
599 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
603 ofstream trimFASTAFile;
604 m->openOutputFile(trimFileName, trimFASTAFile);
606 ofstream scrapFASTAFile;
607 m->openOutputFile(scrapFileName, scrapFASTAFile);
609 ofstream trimQualFile;
610 ofstream scrapQualFile;
612 m->openOutputFile(trimQFileName, trimQualFile);
613 m->openOutputFile(scrapQFileName, scrapQualFile);
616 ofstream trimNameFile;
617 ofstream scrapNameFile;
619 m->openOutputFile(trimNFileName, trimNameFile);
620 m->openOutputFile(scrapNFileName, scrapNameFile);
623 ofstream trimCountFile;
624 ofstream scrapCountFile;
626 m->openOutputFile(trimCFileName, trimCountFile);
627 m->openOutputFile(scrapCFileName, scrapCountFile);
628 if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
631 ofstream outGroupsFile;
632 if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); }
634 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
635 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
636 if (fastaFileNames[i][j] != "") {
638 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
640 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
644 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
652 m->openInputFile(filename, inFASTA);
653 inFASTA.seekg(line.start);
656 if(qFileName != "") {
657 m->openInputFile(qFileName, qFile);
658 qFile.seekg(qline.start);
663 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
667 if (m->control_pressed) {
668 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
669 if ((createGroup) && (countfile == "")) { outGroupsFile.close(); }
670 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
671 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
672 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
673 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;
677 string trashCode = "";
678 int currentSeqsDiffs = 0;
680 Sequence currSeq(inFASTA); m->gobble(inFASTA);
681 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
683 QualityScores currQual;
685 currQual = QualityScores(qFile); m->gobble(qFile);
686 //cout << currQual.getName() << endl;
689 string origSeq = currSeq.getUnaligned();
692 int barcodeIndex = 0;
696 success = trimOligos.stripLinker(currSeq, currQual);
697 if(success > ldiffs) { trashCode += 'k'; }
698 else{ currentSeqsDiffs += success; }
702 if(barcodes.size() != 0){
703 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
704 if(success > bdiffs) { trashCode += 'b'; }
705 else{ currentSeqsDiffs += success; }
709 success = trimOligos.stripSpacer(currSeq, currQual);
710 if(success > sdiffs) { trashCode += 's'; }
711 else{ currentSeqsDiffs += success; }
715 if(numFPrimers != 0){
716 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
717 if(success > pdiffs) { trashCode += 'f'; }
718 else{ currentSeqsDiffs += success; }
721 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
723 if(numRPrimers != 0){
724 success = trimOligos.stripReverse(currSeq, currQual);
725 if(!success) { trashCode += 'r'; }
729 success = keepFirstTrim(currSeq, currQual);
733 success = removeLastTrim(currSeq, currQual);
734 if(!success) { trashCode += 'l'; }
739 int origLength = currSeq.getNumBases();
741 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
742 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
743 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
744 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
745 else { success = 1; }
747 //you don't want to trim, if it fails above then scrap it
748 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
750 if(!success) { trashCode += 'q'; }
753 if(minLength > 0 || maxLength > 0){
754 success = cullLength(currSeq);
755 if(!success) { trashCode += 'l'; }
758 success = cullHomoP(currSeq);
759 if(!success) { trashCode += 'h'; }
762 success = cullAmbigs(currSeq);
763 if(!success) { trashCode += 'n'; }
766 if(flip){ // should go last
767 currSeq.reverseComplement();
769 currQual.flipQScores();
773 if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
775 if(trashCode.length() == 0){
776 currSeq.setAligned(currSeq.getUnaligned());
777 currSeq.printSequence(trimFASTAFile);
780 currQual.printQScores(trimQualFile);
785 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
786 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
787 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
790 int numRedundants = 0;
791 if (countfile != "") {
792 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
793 if (itCount != nameCount.end()) {
794 trimCountFile << itCount->first << '\t' << itCount->second << endl;
795 numRedundants = itCount->second-1;
796 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
800 if(barcodes.size() != 0){
801 string thisGroup = barcodeNameVector[barcodeIndex];
802 if (primers.size() != 0) {
803 if (primerNameVector[primerIndex] != "") {
804 if(thisGroup != "") {
805 thisGroup += "." + primerNameVector[primerIndex];
807 thisGroup = primerNameVector[primerIndex];
812 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
814 if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
815 else { groupMap[currSeq.getName()] = thisGroup; }
817 if (nameFile != "") {
818 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
819 if (itName != nameMap.end()) {
820 vector<string> thisSeqsNames;
821 m->splitAtChar(itName->second, thisSeqsNames, ',');
822 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
823 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
824 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
826 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
829 map<string, int>::iterator it = groupCounts.find(thisGroup);
830 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
831 else { groupCounts[it->first] += (1 + numRedundants); }
838 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
839 currSeq.printSequence(output);
843 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
844 currQual.printQScores(output);
849 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
850 if (itName != nameMap.end()) {
851 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
852 output << itName->first << '\t' << itName->second << endl;
854 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
859 if(nameFile != ""){ //needs to be before the currSeq name is changed
860 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
861 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
862 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
864 if (countfile != "") {
865 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
866 if (itCount != nameCount.end()) {
867 trimCountFile << itCount->first << '\t' << itCount->second << endl;
868 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
871 currSeq.setName(currSeq.getName() + '|' + trashCode);
872 currSeq.setUnaligned(origSeq);
873 currSeq.setAligned(origSeq);
874 currSeq.printSequence(scrapFASTAFile);
876 currQual.printQScores(scrapQualFile);
882 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
883 unsigned long long pos = inFASTA.tellg();
884 if ((pos == -1) || (pos >= line.end)) { break; }
887 if (inFASTA.eof()) { break; }
891 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
895 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
899 trimFASTAFile.close();
900 scrapFASTAFile.close();
901 if (createGroup) { outGroupsFile.close(); }
902 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
903 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
904 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
908 catch(exception& e) {
909 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
914 /**************************************************************************************************/
916 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
923 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
924 //loop through and create all the processes you want
925 while (process != processors) {
929 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
933 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
934 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
935 vector<vector<string> > tempNameFileNames = nameFileNames;
940 for(int i=0;i<tempFASTAFileNames.size();i++){
941 for(int j=0;j<tempFASTAFileNames[i].size();j++){
942 if (tempFASTAFileNames[i][j] != "") {
943 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
944 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
947 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
948 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
951 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
952 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
959 driverCreateTrim(filename,
961 (trimFASTAFileName + toString(getpid()) + ".temp"),
962 (scrapFASTAFileName + toString(getpid()) + ".temp"),
963 (trimQualFileName + toString(getpid()) + ".temp"),
964 (scrapQualFileName + toString(getpid()) + ".temp"),
965 (trimNameFileName + toString(getpid()) + ".temp"),
966 (scrapNameFileName + toString(getpid()) + ".temp"),
967 (trimCountFileName + toString(getpid()) + ".temp"),
968 (scrapCountFileName + toString(getpid()) + ".temp"),
969 (groupFile + toString(getpid()) + ".temp"),
971 tempPrimerQualFileNames,
976 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
978 //pass groupCounts to parent
981 string tempFile = filename + toString(getpid()) + ".num.temp";
982 m->openOutputFile(tempFile, out);
984 out << groupCounts.size() << endl;
986 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
987 out << it->first << '\t' << it->second << endl;
990 out << groupMap.size() << endl;
991 for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
992 out << it->first << '\t' << it->second << endl;
998 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
999 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1006 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1007 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1008 if(qFileName != ""){
1009 m->openOutputFile(trimQualFileName, temp); temp.close();
1010 m->openOutputFile(scrapQualFileName, temp); temp.close();
1012 if (nameFile != "") {
1013 m->openOutputFile(trimNameFileName, temp); temp.close();
1014 m->openOutputFile(scrapNameFileName, temp); temp.close();
1016 if (countfile != "") {
1017 m->openOutputFile(trimCountFileName, temp); temp.close();
1018 m->openOutputFile(scrapCountFileName, temp); temp.close();
1021 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
1023 //force parent to wait until all the processes are done
1024 for (int i=0;i<processIDS.size();i++) {
1025 int temp = processIDS[i];
1029 //////////////////////////////////////////////////////////////////////////////////////////////////////
1030 //Windows version shared memory, so be careful when passing variables through the trimData struct.
1031 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1032 //////////////////////////////////////////////////////////////////////////////////////////////////////
1034 vector<trimData*> pDataArray;
1035 DWORD dwThreadIdArray[processors-1];
1036 HANDLE hThreadArray[processors-1];
1038 //Create processor worker threads.
1039 for( int h=0; h<processors-1; h++){
1041 string extension = "";
1042 if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
1043 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1044 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1045 vector<vector<string> > tempNameFileNames = nameFileNames;
1050 for(int i=0;i<tempFASTAFileNames.size();i++){
1051 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1052 if (tempFASTAFileNames[i][j] != "") {
1053 tempFASTAFileNames[i][j] += extension;
1054 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1056 if(qFileName != ""){
1057 tempPrimerQualFileNames[i][j] += extension;
1058 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1061 tempNameFileNames[i][j] += extension;
1062 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1070 trimData* tempTrim = new trimData(filename,
1071 qFileName, nameFile, countfile,
1072 (trimFASTAFileName+extension),
1073 (scrapFASTAFileName+extension),
1074 (trimQualFileName+extension),
1075 (scrapQualFileName+extension),
1076 (trimNameFileName+extension),
1077 (scrapNameFileName+extension),
1078 (trimCountFileName+extension),
1079 (scrapCountFileName+extension),
1080 (groupFile+extension),
1082 tempPrimerQualFileNames,
1084 lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m,
1085 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
1086 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
1087 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
1088 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
1089 pDataArray.push_back(tempTrim);
1091 hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
1096 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1097 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1098 if(qFileName != ""){
1099 m->openOutputFile(trimQualFileName, temp); temp.close();
1100 m->openOutputFile(scrapQualFileName, temp); temp.close();
1102 if (nameFile != "") {
1103 m->openOutputFile(trimNameFileName, temp); temp.close();
1104 m->openOutputFile(scrapNameFileName, temp); temp.close();
1106 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1107 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1108 vector<vector<string> > tempNameFileNames = nameFileNames;
1111 string extension = toString(processors-1) + ".temp";
1112 for(int i=0;i<tempFASTAFileNames.size();i++){
1113 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1114 if (tempFASTAFileNames[i][j] != "") {
1115 tempFASTAFileNames[i][j] += extension;
1116 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1118 if(qFileName != ""){
1119 tempPrimerQualFileNames[i][j] += extension;
1120 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1123 tempNameFileNames[i][j] += extension;
1124 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1131 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]);
1132 processIDS.push_back(processors-1);
1135 //Wait until all threads have terminated.
1136 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1138 //Close all thread handles and free memory allocations.
1139 for(int i=0; i < pDataArray.size(); i++){
1140 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
1141 map<string, int>::iterator it2 = groupCounts.find(it->first);
1142 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
1143 else { groupCounts[it->first] += it->second; }
1145 for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
1146 map<string, string>::iterator it2 = groupMap.find(it->first);
1147 if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
1148 else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1150 CloseHandle(hThreadArray[i]);
1151 delete pDataArray[i];
1158 for(int i=0;i<processIDS.size();i++){
1160 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1162 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1163 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1164 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1165 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1167 if(qFileName != ""){
1168 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1169 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1170 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1171 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1175 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1176 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1177 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1178 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1181 if(countfile != ""){
1182 m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName);
1183 m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp"));
1184 m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName);
1185 m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp"));
1188 if((createGroup)&&(countfile == "")){
1189 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1190 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1195 for(int j=0;j<fastaFileNames.size();j++){
1196 for(int k=0;k<fastaFileNames[j].size();k++){
1197 if (fastaFileNames[j][k] != "") {
1198 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1199 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1201 if(qFileName != ""){
1202 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1203 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1207 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1208 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1215 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1218 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1219 m->openInputFile(tempFile, in);
1223 in >> tempNum; m->gobble(in);
1226 for (int i = 0; i < tempNum; i++) {
1228 in >> group >> groupNum; m->gobble(in);
1230 map<string, int>::iterator it = groupCounts.find(group);
1231 if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
1232 else { groupCounts[it->first] += groupNum; }
1235 in >> tempNum; m->gobble(in);
1237 for (int i = 0; i < tempNum; i++) {
1238 string group, seqName;
1239 in >> seqName >> group; m->gobble(in);
1241 map<string, string>::iterator it = groupMap.find(seqName);
1242 if (it == groupMap.end()) { groupMap[seqName] = group; }
1243 else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1247 in.close(); m->mothurRemove(tempFile);
1254 catch(exception& e) {
1255 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1260 /**************************************************************************************************/
1262 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1265 vector<unsigned long long> fastaFilePos;
1266 vector<unsigned long long> qfileFilePos;
1268 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1269 //set file positions for fasta file
1270 fastaFilePos = m->divideFile(filename, processors);
1272 //get name of first sequence in each chunk
1273 map<string, int> firstSeqNames;
1274 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1276 m->openInputFile(filename, in);
1277 in.seekg(fastaFilePos[i]);
1280 firstSeqNames[temp.getName()] = i;
1285 if(qfilename != "") {
1286 //seach for filePos of each first name in the qfile and save in qfileFilePos
1288 m->openInputFile(qfilename, inQual);
1291 while(!inQual.eof()){
1292 input = m->getline(inQual);
1294 if (input.length() != 0) {
1295 if(input[0] == '>'){ //this is a sequence name line
1296 istringstream nameStream(input);
1298 string sname = ""; nameStream >> sname;
1299 sname = sname.substr(1);
1301 map<string, int>::iterator it = firstSeqNames.find(sname);
1303 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1304 unsigned long long pos = inQual.tellg();
1305 qfileFilePos.push_back(pos - input.length() - 1);
1306 firstSeqNames.erase(it);
1311 if (firstSeqNames.size() == 0) { break; }
1316 if (firstSeqNames.size() != 0) {
1317 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1318 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1324 //get last file position of qfile
1326 unsigned long long size;
1328 //get num bytes in file
1329 pFile = fopen (qfilename.c_str(),"rb");
1330 if (pFile==NULL) perror ("Error opening file");
1332 fseek (pFile, 0, SEEK_END);
1337 qfileFilePos.push_back(size);
1340 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1341 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
1342 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1343 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1345 if(qfilename == "") { qLines = lines; } //files with duds
1351 if (processors == 1) { //save time
1352 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1353 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1354 lines.push_back(linePair(0, 1000));
1355 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1357 int numFastaSeqs = 0;
1358 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1359 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1361 if (qfilename != "") {
1362 int numQualSeqs = 0;
1363 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1365 if (numFastaSeqs != numQualSeqs) {
1366 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1370 //figure out how many sequences you have to process
1371 int numSeqsPerProcessor = numFastaSeqs / processors;
1372 for (int i = 0; i < processors; i++) {
1373 int startIndex = i * numSeqsPerProcessor;
1374 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1375 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1376 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1379 if(qfilename == "") { qLines = lines; } //files with duds
1384 catch(exception& e) {
1385 m->errorOut(e, "TrimSeqsCommand", "setLines");
1390 //***************************************************************************************************************
1392 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1395 m->openInputFile(oligoFile, inOligos);
1399 string type, oligo, group;
1401 int indexPrimer = 0;
1402 int indexBarcode = 0;
1404 while(!inOligos.eof()){
1408 if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
1411 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1412 m->gobble(inOligos);
1415 m->gobble(inOligos);
1416 //make type case insensitive
1417 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1421 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
1423 for(int i=0;i<oligo.length();i++){
1424 oligo[i] = toupper(oligo[i]);
1425 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1428 if(type == "FORWARD"){
1431 // get rest of line in case there is a primer name
1432 while (!inOligos.eof()) {
1433 char c = inOligos.get();
1434 if (c == 10 || c == 13){ break; }
1435 else if (c == 32 || c == 9){;} //space or tab
1436 else { group += c; }
1439 //check for repeat barcodes
1440 map<string, int>::iterator itPrime = primers.find(oligo);
1441 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1443 if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } }
1445 primers[oligo]=indexPrimer; indexPrimer++;
1446 primerNameVector.push_back(group);
1448 else if(type == "REVERSE"){
1449 //Sequence oligoRC("reverse", oligo);
1450 //oligoRC.reverseComplement();
1451 string oligoRC = reverseOligo(oligo);
1452 revPrimer.push_back(oligoRC);
1454 else if(type == "BARCODE"){
1457 //check for repeat barcodes
1458 map<string, int>::iterator itBar = barcodes.find(oligo);
1459 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1461 barcodes[oligo]=indexBarcode; indexBarcode++;
1462 barcodeNameVector.push_back(group);
1463 }else if(type == "LINKER"){
1464 linker.push_back(oligo);
1465 }else if(type == "SPACER"){
1466 spacer.push_back(oligo);
1468 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1470 m->gobble(inOligos);
1474 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1476 //add in potential combos
1477 if(barcodeNameVector.size() == 0){
1479 barcodeNameVector.push_back("");
1482 if(primerNameVector.size() == 0){
1484 primerNameVector.push_back("");
1487 fastaFileNames.resize(barcodeNameVector.size());
1488 for(int i=0;i<fastaFileNames.size();i++){
1489 fastaFileNames[i].assign(primerNameVector.size(), "");
1491 if(qFileName != "") { qualFileNames = fastaFileNames; }
1492 if(nameFile != "") { nameFileNames = fastaFileNames; }
1495 set<string> uniqueNames; //used to cleanup outputFileNames
1496 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1497 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1499 string primerName = primerNameVector[itPrimer->second];
1500 string barcodeName = barcodeNameVector[itBar->second];
1502 string comboGroupName = "";
1503 string fastaFileName = "";
1504 string qualFileName = "";
1505 string nameFileName = "";
1506 string countFileName = "";
1508 if(primerName == ""){
1509 comboGroupName = barcodeNameVector[itBar->second];
1512 if(barcodeName == ""){
1513 comboGroupName = primerNameVector[itPrimer->second];
1516 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1522 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1523 if (uniqueNames.count(fastaFileName) == 0) {
1524 outputNames.push_back(fastaFileName);
1525 outputTypes["fasta"].push_back(fastaFileName);
1526 uniqueNames.insert(fastaFileName);
1529 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1530 m->openOutputFile(fastaFileName, temp); temp.close();
1532 if(qFileName != ""){
1533 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1534 if (uniqueNames.count(qualFileName) == 0) {
1535 outputNames.push_back(qualFileName);
1536 outputTypes["qfile"].push_back(qualFileName);
1539 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1540 m->openOutputFile(qualFileName, temp); temp.close();
1544 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1545 if (uniqueNames.count(nameFileName) == 0) {
1546 outputNames.push_back(nameFileName);
1547 outputTypes["name"].push_back(nameFileName);
1550 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1551 m->openOutputFile(nameFileName, temp); temp.close();
1556 numFPrimers = primers.size();
1557 numRPrimers = revPrimer.size();
1558 numLinkers = linker.size();
1559 numSpacers = spacer.size();
1561 bool allBlank = true;
1562 for (int i = 0; i < barcodeNameVector.size(); i++) {
1563 if (barcodeNameVector[i] != "") {
1568 for (int i = 0; i < primerNameVector.size(); i++) {
1569 if (primerNameVector[i] != "") {
1576 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1584 catch(exception& e) {
1585 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1589 //***************************************************************************************************************
1591 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1594 if(qscores.getName() != ""){
1595 qscores.trimQScores(-1, keepFirst);
1597 sequence.trim(keepFirst);
1600 catch(exception& e) {
1601 m->errorOut(e, "keepFirstTrim", "countDiffs");
1607 //***************************************************************************************************************
1609 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1613 int length = sequence.getNumBases() - removeLast;
1616 if(qscores.getName() != ""){
1617 qscores.trimQScores(-1, length);
1619 sequence.trim(length);
1628 catch(exception& e) {
1629 m->errorOut(e, "removeLastTrim", "countDiffs");
1635 //***************************************************************************************************************
1637 bool TrimSeqsCommand::cullLength(Sequence& seq){
1640 int length = seq.getNumBases();
1641 bool success = 0; //guilty until proven innocent
1643 if(length >= minLength && maxLength == 0) { success = 1; }
1644 else if(length >= minLength && length <= maxLength) { success = 1; }
1645 else { success = 0; }
1650 catch(exception& e) {
1651 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1657 //***************************************************************************************************************
1659 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1661 int longHomoP = seq.getLongHomoPolymer();
1662 bool success = 0; //guilty until proven innocent
1664 if(longHomoP <= maxHomoP){ success = 1; }
1665 else { success = 0; }
1669 catch(exception& e) {
1670 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1675 //********************************************************************/
1676 string TrimSeqsCommand::reverseOligo(string oligo){
1678 string reverse = "";
1680 for(int i=oligo.length()-1;i>=0;i--){
1682 if(oligo[i] == 'A') { reverse += 'T'; }
1683 else if(oligo[i] == 'T'){ reverse += 'A'; }
1684 else if(oligo[i] == 'U'){ reverse += 'A'; }
1686 else if(oligo[i] == 'G'){ reverse += 'C'; }
1687 else if(oligo[i] == 'C'){ reverse += 'G'; }
1689 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1690 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1692 else if(oligo[i] == 'M'){ reverse += 'K'; }
1693 else if(oligo[i] == 'K'){ reverse += 'M'; }
1695 else if(oligo[i] == 'W'){ reverse += 'W'; }
1696 else if(oligo[i] == 'S'){ reverse += 'S'; }
1698 else if(oligo[i] == 'B'){ reverse += 'V'; }
1699 else if(oligo[i] == 'V'){ reverse += 'B'; }
1701 else if(oligo[i] == 'D'){ reverse += 'H'; }
1702 else if(oligo[i] == 'H'){ reverse += 'D'; }
1704 else { reverse += 'N'; }
1710 catch(exception& e) {
1711 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1716 //***************************************************************************************************************
1718 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1720 int numNs = seq.getAmbigBases();
1721 bool success = 0; //guilty until proven innocent
1723 if(numNs <= maxAmbig) { success = 1; }
1724 else { success = 0; }
1728 catch(exception& e) {
1729 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1734 //***************************************************************************************************************