5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
15 //**********************************************************************************************************************
16 vector<string> TrimSeqsCommand::setParameters(){
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
19 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
20 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
21 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
22 CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
23 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
24 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
25 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
26 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
27 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
28 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
29 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
30 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
31 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
32 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
33 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
34 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
35 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
36 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
37 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
38 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
39 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
40 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
41 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
42 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
43 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
44 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
45 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
46 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
48 vector<string> myArray;
49 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
53 m->errorOut(e, "TrimSeqsCommand", "setParameters");
57 //**********************************************************************************************************************
58 string TrimSeqsCommand::getHelpString(){
60 string helpString = "";
61 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
62 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
63 helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
64 helpString += "The fasta parameter is required.\n";
65 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
66 helpString += "The oligos parameter allows you to provide an oligos file.\n";
67 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
68 helpString += "The count parameter allows you to provide a count file with your fasta file.\n";
69 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
70 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
71 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
72 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
73 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
74 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
75 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
76 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
77 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
78 helpString += "The qfile parameter allows you to provide a quality file.\n";
79 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
80 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
81 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
82 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
83 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
84 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
85 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
86 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
87 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
88 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
89 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
90 helpString += "The trim.seqs command should be in the following format: \n";
91 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
92 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
93 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
94 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
95 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
99 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
103 //**********************************************************************************************************************
104 string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
106 string outputFileName = "";
107 map<string, vector<string> >::iterator it;
109 //is this a type this command creates
110 it = outputTypes.find(type);
111 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
113 if (type == "qfile") { outputFileName = "qual"; }
114 else if (type == "fasta") { outputFileName = "fasta"; }
115 else if (type == "group") { outputFileName = "groups"; }
116 else if (type == "name") { outputFileName = "names"; }
117 else if (type == "count") { outputFileName = "count.table"; }
118 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
120 return outputFileName;
122 catch(exception& e) {
123 m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag");
129 //**********************************************************************************************************************
131 TrimSeqsCommand::TrimSeqsCommand(){
133 abort = true; calledHelp = true;
135 vector<string> tempOutNames;
136 outputTypes["fasta"] = tempOutNames;
137 outputTypes["qfile"] = tempOutNames;
138 outputTypes["group"] = tempOutNames;
139 outputTypes["name"] = tempOutNames;
140 outputTypes["count"] = tempOutNames;
142 catch(exception& e) {
143 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
147 //***************************************************************************************************************
149 TrimSeqsCommand::TrimSeqsCommand(string option) {
152 abort = false; calledHelp = false;
155 //allow user to run help
156 if(option == "help") { help(); abort = true; calledHelp = true; }
157 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
160 vector<string> myArray = setParameters();
162 OptionParser parser(option);
163 map<string,string> parameters = parser.getParameters();
165 ValidParameters validParameter;
166 map<string,string>::iterator it;
168 //check to make sure all parameters are valid for command
169 for (it = parameters.begin(); it != parameters.end(); it++) {
170 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
173 //initialize outputTypes
174 vector<string> tempOutNames;
175 outputTypes["fasta"] = tempOutNames;
176 outputTypes["qfile"] = tempOutNames;
177 outputTypes["group"] = tempOutNames;
178 outputTypes["name"] = tempOutNames;
179 outputTypes["count"] = tempOutNames;
181 //if the user changes the input directory command factory will send this info to us in the output parameter
182 string inputDir = validParameter.validFile(parameters, "inputdir", false);
183 if (inputDir == "not found"){ inputDir = ""; }
186 it = parameters.find("fasta");
187 //user has given a template file
188 if(it != parameters.end()){
189 path = m->hasPath(it->second);
190 //if the user has not given a path then, add inputdir. else leave path alone.
191 if (path == "") { parameters["fasta"] = inputDir + it->second; }
194 it = parameters.find("oligos");
195 //user has given a template file
196 if(it != parameters.end()){
197 path = m->hasPath(it->second);
198 //if the user has not given a path then, add inputdir. else leave path alone.
199 if (path == "") { parameters["oligos"] = inputDir + it->second; }
202 it = parameters.find("qfile");
203 //user has given a template file
204 if(it != parameters.end()){
205 path = m->hasPath(it->second);
206 //if the user has not given a path then, add inputdir. else leave path alone.
207 if (path == "") { parameters["qfile"] = inputDir + it->second; }
210 it = parameters.find("name");
211 //user has given a template file
212 if(it != parameters.end()){
213 path = m->hasPath(it->second);
214 //if the user has not given a path then, add inputdir. else leave path alone.
215 if (path == "") { parameters["name"] = inputDir + it->second; }
218 it = parameters.find("count");
219 //user has given a template file
220 if(it != parameters.end()){
221 path = m->hasPath(it->second);
222 //if the user has not given a path then, add inputdir. else leave path alone.
223 if (path == "") { parameters["count"] = inputDir + it->second; }
229 //check for required parameters
230 fastaFile = validParameter.validFile(parameters, "fasta", true);
231 if (fastaFile == "not found") {
232 fastaFile = m->getFastaFile();
233 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
234 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
235 }else if (fastaFile == "not open") { abort = true; }
236 else { m->setFastaFile(fastaFile); }
238 //if the user changes the output directory command factory will send this info to us in the output parameter
239 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
241 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
245 //check for optional parameter and set defaults
246 // ...at some point should added some additional type checking...
248 temp = validParameter.validFile(parameters, "flip", false);
249 if (temp == "not found") { flip = 0; }
250 else { flip = m->isTrue(temp); }
252 temp = validParameter.validFile(parameters, "oligos", true);
253 if (temp == "not found"){ oligoFile = ""; }
254 else if(temp == "not open"){ abort = true; }
255 else { oligoFile = temp; m->setOligosFile(oligoFile); }
258 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
259 m->mothurConvert(temp, maxAmbig);
261 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
262 m->mothurConvert(temp, maxHomoP);
264 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
265 m->mothurConvert(temp, minLength);
267 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
268 m->mothurConvert(temp, maxLength);
270 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
271 m->mothurConvert(temp, bdiffs);
273 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
274 m->mothurConvert(temp, pdiffs);
276 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
277 m->mothurConvert(temp, ldiffs);
279 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
280 m->mothurConvert(temp, sdiffs);
282 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
283 m->mothurConvert(temp, tdiffs);
285 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
287 temp = validParameter.validFile(parameters, "qfile", true);
288 if (temp == "not found") { qFileName = ""; }
289 else if(temp == "not open") { abort = true; }
290 else { qFileName = temp; m->setQualFile(qFileName); }
292 temp = validParameter.validFile(parameters, "name", true);
293 if (temp == "not found") { nameFile = ""; }
294 else if(temp == "not open") { nameFile = ""; abort = true; }
295 else { nameFile = temp; m->setNameFile(nameFile); }
297 countfile = validParameter.validFile(parameters, "count", true);
298 if (countfile == "not open") { abort = true; countfile = ""; }
299 else if (countfile == "not found") { countfile = ""; }
300 else { m->setCountTableFile(countfile); }
302 if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
304 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
305 m->mothurConvert(temp, qThreshold);
307 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
308 qtrim = m->isTrue(temp);
310 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
311 convert(temp, qRollAverage);
313 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
314 convert(temp, qWindowAverage);
316 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
317 convert(temp, qWindowSize);
319 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
320 convert(temp, qWindowStep);
322 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
323 convert(temp, qAverage);
325 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
326 convert(temp, keepFirst);
328 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
329 convert(temp, removeLast);
331 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
332 allFiles = m->isTrue(temp);
334 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
335 keepforward = m->isTrue(temp);
337 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
338 m->setProcessors(temp);
339 m->mothurConvert(temp, processors);
342 if(allFiles && (oligoFile == "")){
343 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
345 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
346 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
350 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
351 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
355 if (countfile == "") {
356 if (nameFile == "") {
357 vector<string> files; files.push_back(fastaFile);
358 parser.getNameFile(files);
364 catch(exception& e) {
365 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
369 //***************************************************************************************************************
371 int TrimSeqsCommand::execute(){
374 if (abort == true) { if (calledHelp) { return 0; } return 2; }
376 numFPrimers = 0; //this needs to be initialized
381 vector<vector<string> > fastaFileNames;
382 vector<vector<string> > qualFileNames;
383 vector<vector<string> > nameFileNames;
385 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta");
386 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
388 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta");
389 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
391 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile");
392 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile");
394 if (qFileName != "") {
395 outputNames.push_back(trimQualFile);
396 outputNames.push_back(scrapQualFile);
397 outputTypes["qfile"].push_back(trimQualFile);
398 outputTypes["qfile"].push_back(scrapQualFile);
401 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name");
402 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name");
404 if (nameFile != "") {
405 m->readNames(nameFile, nameMap);
406 outputNames.push_back(trimNameFile);
407 outputNames.push_back(scrapNameFile);
408 outputTypes["name"].push_back(trimNameFile);
409 outputTypes["name"].push_back(scrapNameFile);
412 string trimCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "trim." + getOutputFileNameTag("count");
413 string scrapCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "scrap." + getOutputFileNameTag("count");
415 if (countfile != "") {
417 ct.readTable(countfile);
418 nameCount = ct.getNameMap();
419 outputNames.push_back(trimCountFile);
420 outputNames.push_back(scrapCountFile);
421 outputTypes["count"].push_back(trimCountFile);
422 outputTypes["count"].push_back(scrapCountFile);
426 if (m->control_pressed) { return 0; }
428 string outputGroupFileName;
430 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
431 if ((createGroup) && (countfile == "")){
432 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group");
433 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
437 //fills lines and qlines
438 setLines(fastaFile, qFileName);
441 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
443 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
447 if (m->control_pressed) { return 0; }
450 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
451 map<string, string>::iterator it;
452 set<string> namesToRemove;
453 for(int i=0;i<fastaFileNames.size();i++){
454 for(int j=0;j<fastaFileNames[0].size();j++){
455 if (fastaFileNames[i][j] != "") {
456 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
457 if(m->isBlank(fastaFileNames[i][j])){
458 m->mothurRemove(fastaFileNames[i][j]);
459 namesToRemove.insert(fastaFileNames[i][j]);
462 m->mothurRemove(qualFileNames[i][j]);
463 namesToRemove.insert(qualFileNames[i][j]);
467 m->mothurRemove(nameFileNames[i][j]);
468 namesToRemove.insert(nameFileNames[i][j]);
471 it = uniqueFastaNames.find(fastaFileNames[i][j]);
472 if (it == uniqueFastaNames.end()) {
473 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
481 //remove names for outputFileNames, just cleans up the output
482 vector<string> outputNames2;
483 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
484 outputNames = outputNames2;
486 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
488 m->openInputFile(it->first, in);
491 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first));
492 if (countfile == "") { thisGroupName += getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
493 else { thisGroupName += getOutputFileNameTag("count"); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
494 m->openOutputFile(thisGroupName, out);
496 if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; }
499 if (m->control_pressed) { break; }
501 Sequence currSeq(in); m->gobble(in);
502 if (countfile == "") {
503 out << currSeq.getName() << '\t' << it->second << endl;
505 if (nameFile != "") {
506 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
507 if (itName != nameMap.end()) {
508 vector<string> thisSeqsNames;
509 m->splitAtChar(itName->second, thisSeqsNames, ',');
510 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
511 out << thisSeqsNames[k] << '\t' << it->second << endl;
513 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
516 map<string, int>::iterator itTotalReps = nameCount.find(currSeq.getName());
517 if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; }
518 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
525 if (countfile != "") { //create countfile with group info included
526 CountTable* ct = new CountTable();
527 ct->readTable(trimCountFile);
528 map<string, int> justTrimmedNames = ct->getNameMap();
532 for (map<string, int>::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); }
533 vector<int> tempCounts; tempCounts.resize(groupCounts.size(), 0);
534 for (map<string, int>::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) {
535 newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance
536 map<string, string>::iterator it2 = groupMap.find(itNames->first);
537 if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); }
538 else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; }
540 newCt.printTable(trimCountFile);
544 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
546 //output group counts
547 m->mothurOutEndLine();
549 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
550 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
551 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
553 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
555 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
557 //set fasta file as new current fastafile
559 itTypes = outputTypes.find("fasta");
560 if (itTypes != outputTypes.end()) {
561 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
564 itTypes = outputTypes.find("name");
565 if (itTypes != outputTypes.end()) {
566 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
569 itTypes = outputTypes.find("qfile");
570 if (itTypes != outputTypes.end()) {
571 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
574 itTypes = outputTypes.find("group");
575 if (itTypes != outputTypes.end()) {
576 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
579 itTypes = outputTypes.find("count");
580 if (itTypes != outputTypes.end()) {
581 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
584 m->mothurOutEndLine();
585 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
586 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
587 m->mothurOutEndLine();
592 catch(exception& e) {
593 m->errorOut(e, "TrimSeqsCommand", "execute");
598 /**************************************************************************************/
599 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
603 ofstream trimFASTAFile;
604 m->openOutputFile(trimFileName, trimFASTAFile);
606 ofstream scrapFASTAFile;
607 m->openOutputFile(scrapFileName, scrapFASTAFile);
609 ofstream trimQualFile;
610 ofstream scrapQualFile;
612 m->openOutputFile(trimQFileName, trimQualFile);
613 m->openOutputFile(scrapQFileName, scrapQualFile);
616 ofstream trimNameFile;
617 ofstream scrapNameFile;
619 m->openOutputFile(trimNFileName, trimNameFile);
620 m->openOutputFile(scrapNFileName, scrapNameFile);
623 ofstream trimCountFile;
624 ofstream scrapCountFile;
626 m->openOutputFile(trimCFileName, trimCountFile);
627 m->openOutputFile(scrapCFileName, scrapCountFile);
628 if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
631 ofstream outGroupsFile;
632 if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); }
634 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
635 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
636 if (fastaFileNames[i][j] != "") {
638 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
640 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
644 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
652 m->openInputFile(filename, inFASTA);
653 inFASTA.seekg(line.start);
656 if(qFileName != "") {
657 m->openInputFile(qFileName, qFile);
658 qFile.seekg(qline.start);
663 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
667 if (m->control_pressed) {
668 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
669 if ((createGroup) && (countfile == "")) { outGroupsFile.close(); }
670 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
671 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
672 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
673 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;
677 string trashCode = "";
678 int currentSeqsDiffs = 0;
680 Sequence currSeq(inFASTA); m->gobble(inFASTA);
681 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
683 QualityScores currQual;
685 currQual = QualityScores(qFile); m->gobble(qFile);
686 //cout << currQual.getName() << endl;
689 string origSeq = currSeq.getUnaligned();
692 int barcodeIndex = 0;
696 success = trimOligos.stripLinker(currSeq, currQual);
697 if(success > ldiffs) { trashCode += 'k'; }
698 else{ currentSeqsDiffs += success; }
702 if(barcodes.size() != 0){
703 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
704 if(success > bdiffs) { trashCode += 'b'; }
705 else{ currentSeqsDiffs += success; }
708 if(rbarcodes.size() != 0){
709 success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
710 if(success > bdiffs) { trashCode += 'b'; }
711 else{ currentSeqsDiffs += success; }
715 success = trimOligos.stripSpacer(currSeq, currQual);
716 if(success > sdiffs) { trashCode += 's'; }
717 else{ currentSeqsDiffs += success; }
721 if(numFPrimers != 0){
722 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
723 if(success > pdiffs) { trashCode += 'f'; }
724 else{ currentSeqsDiffs += success; }
727 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
729 if(numRPrimers != 0){
730 success = trimOligos.stripReverse(currSeq, currQual);
731 if(!success) { trashCode += 'r'; }
735 success = keepFirstTrim(currSeq, currQual);
739 success = removeLastTrim(currSeq, currQual);
740 if(!success) { trashCode += 'l'; }
745 int origLength = currSeq.getNumBases();
747 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
748 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
749 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
750 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
751 else { success = 1; }
753 //you don't want to trim, if it fails above then scrap it
754 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
756 if(!success) { trashCode += 'q'; }
759 if(minLength > 0 || maxLength > 0){
760 success = cullLength(currSeq);
761 if(!success) { trashCode += 'l'; }
764 success = cullHomoP(currSeq);
765 if(!success) { trashCode += 'h'; }
768 success = cullAmbigs(currSeq);
769 if(!success) { trashCode += 'n'; }
772 if(flip){ // should go last
773 currSeq.reverseComplement();
775 currQual.flipQScores();
779 if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
781 if(trashCode.length() == 0){
782 currSeq.setAligned(currSeq.getUnaligned());
783 currSeq.printSequence(trimFASTAFile);
786 currQual.printQScores(trimQualFile);
791 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
792 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
793 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
796 int numRedundants = 0;
797 if (countfile != "") {
798 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
799 if (itCount != nameCount.end()) {
800 trimCountFile << itCount->first << '\t' << itCount->second << endl;
801 numRedundants = itCount->second-1;
802 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
806 if(barcodes.size() != 0){
807 string thisGroup = barcodeNameVector[barcodeIndex];
808 if (primers.size() != 0) {
809 if (primerNameVector[primerIndex] != "") {
810 if(thisGroup != "") {
811 thisGroup += "." + primerNameVector[primerIndex];
813 thisGroup = primerNameVector[primerIndex];
818 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
820 if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
821 else { groupMap[currSeq.getName()] = thisGroup; }
823 if (nameFile != "") {
824 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
825 if (itName != nameMap.end()) {
826 vector<string> thisSeqsNames;
827 m->splitAtChar(itName->second, thisSeqsNames, ',');
828 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
829 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
830 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
832 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
835 map<string, int>::iterator it = groupCounts.find(thisGroup);
836 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
837 else { groupCounts[it->first] += (1 + numRedundants); }
844 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
845 currSeq.printSequence(output);
849 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
850 currQual.printQScores(output);
855 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
856 if (itName != nameMap.end()) {
857 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
858 output << itName->first << '\t' << itName->second << endl;
860 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
865 if(nameFile != ""){ //needs to be before the currSeq name is changed
866 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
867 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
868 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
870 if (countfile != "") {
871 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
872 if (itCount != nameCount.end()) {
873 trimCountFile << itCount->first << '\t' << itCount->second << endl;
874 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
877 currSeq.setName(currSeq.getName() + '|' + trashCode);
878 currSeq.setUnaligned(origSeq);
879 currSeq.setAligned(origSeq);
880 currSeq.printSequence(scrapFASTAFile);
882 currQual.printQScores(scrapQualFile);
888 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
889 unsigned long long pos = inFASTA.tellg();
890 if ((pos == -1) || (pos >= line.end)) { break; }
893 if (inFASTA.eof()) { break; }
897 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
901 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
905 trimFASTAFile.close();
906 scrapFASTAFile.close();
907 if (createGroup) { outGroupsFile.close(); }
908 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
909 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
910 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
914 catch(exception& e) {
915 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
920 /**************************************************************************************************/
922 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
929 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
930 //loop through and create all the processes you want
931 while (process != processors) {
935 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
939 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
940 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
941 vector<vector<string> > tempNameFileNames = nameFileNames;
946 for(int i=0;i<tempFASTAFileNames.size();i++){
947 for(int j=0;j<tempFASTAFileNames[i].size();j++){
948 if (tempFASTAFileNames[i][j] != "") {
949 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
950 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
953 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
954 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
957 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
958 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
965 driverCreateTrim(filename,
967 (trimFASTAFileName + toString(getpid()) + ".temp"),
968 (scrapFASTAFileName + toString(getpid()) + ".temp"),
969 (trimQualFileName + toString(getpid()) + ".temp"),
970 (scrapQualFileName + toString(getpid()) + ".temp"),
971 (trimNameFileName + toString(getpid()) + ".temp"),
972 (scrapNameFileName + toString(getpid()) + ".temp"),
973 (trimCountFileName + toString(getpid()) + ".temp"),
974 (scrapCountFileName + toString(getpid()) + ".temp"),
975 (groupFile + toString(getpid()) + ".temp"),
977 tempPrimerQualFileNames,
982 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
984 //pass groupCounts to parent
987 string tempFile = filename + toString(getpid()) + ".num.temp";
988 m->openOutputFile(tempFile, out);
990 out << groupCounts.size() << endl;
992 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
993 out << it->first << '\t' << it->second << endl;
996 out << groupMap.size() << endl;
997 for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
998 out << it->first << '\t' << it->second << endl;
1004 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1005 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1012 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1013 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1014 if(qFileName != ""){
1015 m->openOutputFile(trimQualFileName, temp); temp.close();
1016 m->openOutputFile(scrapQualFileName, temp); temp.close();
1018 if (nameFile != "") {
1019 m->openOutputFile(trimNameFileName, temp); temp.close();
1020 m->openOutputFile(scrapNameFileName, temp); temp.close();
1022 if (countfile != "") {
1023 m->openOutputFile(trimCountFileName, temp); temp.close();
1024 m->openOutputFile(scrapCountFileName, temp); temp.close();
1027 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
1029 //force parent to wait until all the processes are done
1030 for (int i=0;i<processIDS.size();i++) {
1031 int temp = processIDS[i];
1035 //////////////////////////////////////////////////////////////////////////////////////////////////////
1036 //Windows version shared memory, so be careful when passing variables through the trimData struct.
1037 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1038 //////////////////////////////////////////////////////////////////////////////////////////////////////
1040 vector<trimData*> pDataArray;
1041 DWORD dwThreadIdArray[processors-1];
1042 HANDLE hThreadArray[processors-1];
1044 //Create processor worker threads.
1045 for( int i=0; i<processors-1; i++){
1047 string extension = "";
1048 if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
1049 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1050 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1051 vector<vector<string> > tempNameFileNames = nameFileNames;
1056 for(int i=0;i<tempFASTAFileNames.size();i++){
1057 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1058 if (tempFASTAFileNames[i][j] != "") {
1059 tempFASTAFileNames[i][j] += extension;
1060 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1062 if(qFileName != ""){
1063 tempPrimerQualFileNames[i][j] += extension;
1064 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1067 tempNameFileNames[i][j] += extension;
1068 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1076 trimData* tempTrim = new trimData(filename,
1077 qFileName, nameFile, countfile,
1078 (trimFASTAFileName+extension),
1079 (scrapFASTAFileName+extension),
1080 (trimQualFileName+extension),
1081 (scrapQualFileName+extension),
1082 (trimNameFileName+extension),
1083 (scrapNameFileName+extension),
1084 (trimCountFileName+extension),
1085 (scrapCountFileName+extension),
1086 (groupFile+extension),
1088 tempPrimerQualFileNames,
1090 lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
1091 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
1092 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
1093 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
1094 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
1095 pDataArray.push_back(tempTrim);
1097 hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1102 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1103 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1104 if(qFileName != ""){
1105 m->openOutputFile(trimQualFileName, temp); temp.close();
1106 m->openOutputFile(scrapQualFileName, temp); temp.close();
1108 if (nameFile != "") {
1109 m->openOutputFile(trimNameFileName, temp); temp.close();
1110 m->openOutputFile(scrapNameFileName, temp); temp.close();
1113 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
1114 processIDS.push_back(processors-1);
1117 //Wait until all threads have terminated.
1118 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1120 //Close all thread handles and free memory allocations.
1121 for(int i=0; i < pDataArray.size(); i++){
1122 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
1123 map<string, int>::iterator it2 = groupCounts.find(it->first);
1124 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
1125 else { groupCounts[it->first] += it->second; }
1127 for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
1128 map<string, string>::iterator it2 = groupMap.find(it->first);
1129 if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
1130 else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1132 CloseHandle(hThreadArray[i]);
1133 delete pDataArray[i];
1140 for(int i=0;i<processIDS.size();i++){
1142 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1144 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1145 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1146 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1147 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1149 if(qFileName != ""){
1150 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1151 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1152 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1153 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1157 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1158 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1159 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1160 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1163 if(countfile != ""){
1164 m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName);
1165 m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp"));
1166 m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName);
1167 m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp"));
1170 if((createGroup)&&(countfile == "")){
1171 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1172 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1177 for(int j=0;j<fastaFileNames.size();j++){
1178 for(int k=0;k<fastaFileNames[j].size();k++){
1179 if (fastaFileNames[j][k] != "") {
1180 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1181 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1183 if(qFileName != ""){
1184 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1185 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1189 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1190 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1197 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1200 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1201 m->openInputFile(tempFile, in);
1205 in >> tempNum; m->gobble(in);
1208 for (int i = 0; i < tempNum; i++) {
1210 in >> group >> groupNum; m->gobble(in);
1212 map<string, int>::iterator it = groupCounts.find(group);
1213 if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
1214 else { groupCounts[it->first] += groupNum; }
1217 in >> tempNum; m->gobble(in);
1219 for (int i = 0; i < tempNum; i++) {
1220 string group, seqName;
1221 in >> seqName >> group; m->gobble(in);
1223 map<string, string>::iterator it = groupMap.find(seqName);
1224 if (it == groupMap.end()) { groupMap[seqName] = group; }
1225 else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1229 in.close(); m->mothurRemove(tempFile);
1236 catch(exception& e) {
1237 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1242 /**************************************************************************************************/
1244 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1247 vector<unsigned long long> fastaFilePos;
1248 vector<unsigned long long> qfileFilePos;
1250 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1251 //set file positions for fasta file
1252 fastaFilePos = m->divideFile(filename, processors);
1254 //get name of first sequence in each chunk
1255 map<string, int> firstSeqNames;
1256 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1258 m->openInputFile(filename, in);
1259 in.seekg(fastaFilePos[i]);
1262 firstSeqNames[temp.getName()] = i;
1267 if(qfilename != "") {
1268 //seach for filePos of each first name in the qfile and save in qfileFilePos
1270 m->openInputFile(qfilename, inQual);
1273 while(!inQual.eof()){
1274 input = m->getline(inQual);
1276 if (input.length() != 0) {
1277 if(input[0] == '>'){ //this is a sequence name line
1278 istringstream nameStream(input);
1280 string sname = ""; nameStream >> sname;
1281 sname = sname.substr(1);
1283 map<string, int>::iterator it = firstSeqNames.find(sname);
1285 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1286 unsigned long long pos = inQual.tellg();
1287 qfileFilePos.push_back(pos - input.length() - 1);
1288 firstSeqNames.erase(it);
1293 if (firstSeqNames.size() == 0) { break; }
1298 if (firstSeqNames.size() != 0) {
1299 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1300 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1306 //get last file position of qfile
1308 unsigned long long size;
1310 //get num bytes in file
1311 pFile = fopen (qfilename.c_str(),"rb");
1312 if (pFile==NULL) perror ("Error opening file");
1314 fseek (pFile, 0, SEEK_END);
1319 qfileFilePos.push_back(size);
1322 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1323 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
1324 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1325 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1327 if(qfilename == "") { qLines = lines; } //files with duds
1333 if (processors == 1) { //save time
1334 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1335 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1336 lines.push_back(linePair(0, 1000));
1337 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1339 int numFastaSeqs = 0;
1340 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1341 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1343 if (qfilename != "") {
1344 int numQualSeqs = 0;
1345 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1347 if (numFastaSeqs != numQualSeqs) {
1348 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1352 //figure out how many sequences you have to process
1353 int numSeqsPerProcessor = numFastaSeqs / processors;
1354 for (int i = 0; i < processors; i++) {
1355 int startIndex = i * numSeqsPerProcessor;
1356 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1357 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1358 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1361 if(qfilename == "") { qLines = lines; } //files with duds
1366 catch(exception& e) {
1367 m->errorOut(e, "TrimSeqsCommand", "setLines");
1372 //***************************************************************************************************************
1374 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1377 m->openInputFile(oligoFile, inOligos);
1381 string type, oligo, group;
1383 int indexPrimer = 0;
1384 int indexBarcode = 0;
1386 while(!inOligos.eof()){
1390 if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
1393 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1394 m->gobble(inOligos);
1397 m->gobble(inOligos);
1398 //make type case insensitive
1399 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1403 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
1405 for(int i=0;i<oligo.length();i++){
1406 oligo[i] = toupper(oligo[i]);
1407 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1410 if(type == "FORWARD"){
1413 // get rest of line in case there is a primer name
1414 while (!inOligos.eof()) {
1415 char c = inOligos.get();
1416 if (c == 10 || c == 13){ break; }
1417 else if (c == 32 || c == 9){;} //space or tab
1418 else { group += c; }
1421 //check for repeat barcodes
1422 map<string, int>::iterator itPrime = primers.find(oligo);
1423 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1425 if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } }
1427 primers[oligo]=indexPrimer; indexPrimer++;
1428 primerNameVector.push_back(group);
1430 else if(type == "REVERSE"){
1431 //Sequence oligoRC("reverse", oligo);
1432 //oligoRC.reverseComplement();
1433 string oligoRC = reverseOligo(oligo);
1434 revPrimer.push_back(oligoRC);
1436 else if(type == "BARCODE"){
1439 //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
1440 //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
1442 while (!inOligos.eof()) {
1443 char c = inOligos.get();
1444 if (c == 10 || c == 13){ break; }
1445 else if (c == 32 || c == 9){;} //space or tab
1449 //then this is illumina data with 4 columns
1452 for(int i=0;i<group.length();i++){
1453 group[i] = toupper(group[i]);
1454 if(group[i] == 'U') { group[i] = 'T'; }
1457 if (m->debug) { m->mothurOut("[DEBUG]: reading reverse " + group + ", and group = " + temp + ".\n"); }
1459 string reverseBarcode = reverseOligo(group); //reverse barcode
1460 //check for repeat barcodes
1461 map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
1462 if (itBar != rbarcodes.end()) { m->mothurOut("reverse barcode " + group + " is in your oligos file already."); m->mothurOutEndLine(); }
1465 rbarcodes[reverseBarcode]=indexBarcode;
1466 }else { if (m->debug) { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); } }
1468 //check for repeat barcodes
1469 map<string, int>::iterator itBar = barcodes.find(oligo);
1470 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1472 barcodes[oligo]=indexBarcode; indexBarcode++;
1473 barcodeNameVector.push_back(group);
1474 }else if(type == "LINKER"){
1475 linker.push_back(oligo);
1476 }else if(type == "SPACER"){
1477 spacer.push_back(oligo);
1479 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1481 m->gobble(inOligos);
1485 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1487 //add in potential combos
1488 if(barcodeNameVector.size() == 0){
1490 barcodeNameVector.push_back("");
1493 if(primerNameVector.size() == 0){
1495 primerNameVector.push_back("");
1498 fastaFileNames.resize(barcodeNameVector.size());
1499 for(int i=0;i<fastaFileNames.size();i++){
1500 fastaFileNames[i].assign(primerNameVector.size(), "");
1502 if(qFileName != "") { qualFileNames = fastaFileNames; }
1503 if(nameFile != "") { nameFileNames = fastaFileNames; }
1506 set<string> uniqueNames; //used to cleanup outputFileNames
1507 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1508 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1510 string primerName = primerNameVector[itPrimer->second];
1511 string barcodeName = barcodeNameVector[itBar->second];
1513 string comboGroupName = "";
1514 string fastaFileName = "";
1515 string qualFileName = "";
1516 string nameFileName = "";
1517 string countFileName = "";
1519 if(primerName == ""){
1520 comboGroupName = barcodeNameVector[itBar->second];
1523 if(barcodeName == ""){
1524 comboGroupName = primerNameVector[itPrimer->second];
1527 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1533 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1534 if (uniqueNames.count(fastaFileName) == 0) {
1535 outputNames.push_back(fastaFileName);
1536 outputTypes["fasta"].push_back(fastaFileName);
1537 uniqueNames.insert(fastaFileName);
1540 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1541 m->openOutputFile(fastaFileName, temp); temp.close();
1543 if(qFileName != ""){
1544 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1545 if (uniqueNames.count(qualFileName) == 0) {
1546 outputNames.push_back(qualFileName);
1547 outputTypes["qfile"].push_back(qualFileName);
1550 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1551 m->openOutputFile(qualFileName, temp); temp.close();
1555 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1556 if (uniqueNames.count(nameFileName) == 0) {
1557 outputNames.push_back(nameFileName);
1558 outputTypes["name"].push_back(nameFileName);
1561 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1562 m->openOutputFile(nameFileName, temp); temp.close();
1567 numFPrimers = primers.size();
1568 numRPrimers = revPrimer.size();
1569 numLinkers = linker.size();
1570 numSpacers = spacer.size();
1572 bool allBlank = true;
1573 for (int i = 0; i < barcodeNameVector.size(); i++) {
1574 if (barcodeNameVector[i] != "") {
1579 for (int i = 0; i < primerNameVector.size(); i++) {
1580 if (primerNameVector[i] != "") {
1587 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1595 catch(exception& e) {
1596 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1600 //***************************************************************************************************************
1602 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1605 if(qscores.getName() != ""){
1606 qscores.trimQScores(-1, keepFirst);
1608 sequence.trim(keepFirst);
1611 catch(exception& e) {
1612 m->errorOut(e, "keepFirstTrim", "countDiffs");
1618 //***************************************************************************************************************
1620 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1624 int length = sequence.getNumBases() - removeLast;
1627 if(qscores.getName() != ""){
1628 qscores.trimQScores(-1, length);
1630 sequence.trim(length);
1639 catch(exception& e) {
1640 m->errorOut(e, "removeLastTrim", "countDiffs");
1646 //***************************************************************************************************************
1648 bool TrimSeqsCommand::cullLength(Sequence& seq){
1651 int length = seq.getNumBases();
1652 bool success = 0; //guilty until proven innocent
1654 if(length >= minLength && maxLength == 0) { success = 1; }
1655 else if(length >= minLength && length <= maxLength) { success = 1; }
1656 else { success = 0; }
1661 catch(exception& e) {
1662 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1668 //***************************************************************************************************************
1670 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1672 int longHomoP = seq.getLongHomoPolymer();
1673 bool success = 0; //guilty until proven innocent
1675 if(longHomoP <= maxHomoP){ success = 1; }
1676 else { success = 0; }
1680 catch(exception& e) {
1681 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1686 //********************************************************************/
1687 string TrimSeqsCommand::reverseOligo(string oligo){
1689 string reverse = "";
1691 for(int i=oligo.length()-1;i>=0;i--){
1693 if(oligo[i] == 'A') { reverse += 'T'; }
1694 else if(oligo[i] == 'T'){ reverse += 'A'; }
1695 else if(oligo[i] == 'U'){ reverse += 'A'; }
1697 else if(oligo[i] == 'G'){ reverse += 'C'; }
1698 else if(oligo[i] == 'C'){ reverse += 'G'; }
1700 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1701 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1703 else if(oligo[i] == 'M'){ reverse += 'K'; }
1704 else if(oligo[i] == 'K'){ reverse += 'M'; }
1706 else if(oligo[i] == 'W'){ reverse += 'W'; }
1707 else if(oligo[i] == 'S'){ reverse += 'S'; }
1709 else if(oligo[i] == 'B'){ reverse += 'V'; }
1710 else if(oligo[i] == 'V'){ reverse += 'B'; }
1712 else if(oligo[i] == 'D'){ reverse += 'H'; }
1713 else if(oligo[i] == 'H'){ reverse += 'D'; }
1715 else { reverse += 'N'; }
1721 catch(exception& e) {
1722 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1727 //***************************************************************************************************************
1729 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1731 int numNs = seq.getAmbigBases();
1732 bool success = 0; //guilty until proven innocent
1734 if(numNs <= maxAmbig) { success = 1; }
1735 else { success = 0; }
1739 catch(exception& e) {
1740 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1745 //***************************************************************************************************************