5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
15 //**********************************************************************************************************************
16 vector<string> TrimSeqsCommand::setParameters(){
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
19 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
20 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
21 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
22 CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
23 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
24 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
25 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
26 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
27 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
28 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
29 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
30 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
31 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
32 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
33 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
34 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
35 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
36 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
37 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
38 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
39 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
40 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
41 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
42 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
43 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
44 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
45 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
46 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
48 vector<string> myArray;
49 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
53 m->errorOut(e, "TrimSeqsCommand", "setParameters");
57 //**********************************************************************************************************************
58 string TrimSeqsCommand::getHelpString(){
60 string helpString = "";
61 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
62 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
63 helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
64 helpString += "The fasta parameter is required.\n";
65 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
66 helpString += "The oligos parameter allows you to provide an oligos file.\n";
67 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
68 helpString += "The count parameter allows you to provide a count file with your fasta file.\n";
69 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
70 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
71 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
72 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
73 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
74 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
75 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
76 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
77 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
78 helpString += "The qfile parameter allows you to provide a quality file.\n";
79 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
80 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
81 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
82 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
83 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
84 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
85 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
86 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
87 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
88 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
89 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
90 helpString += "The trim.seqs command should be in the following format: \n";
91 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
92 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
93 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
94 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
95 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
99 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
103 //**********************************************************************************************************************
104 string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
106 string outputFileName = "";
107 map<string, vector<string> >::iterator it;
109 //is this a type this command creates
110 it = outputTypes.find(type);
111 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
113 if (type == "qfile") { outputFileName = "qual"; }
114 else if (type == "fasta") { outputFileName = "fasta"; }
115 else if (type == "group") { outputFileName = "groups"; }
116 else if (type == "name") { outputFileName = "names"; }
117 else if (type == "count") { outputFileName = "count_table"; }
118 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
120 return outputFileName;
122 catch(exception& e) {
123 m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag");
129 //**********************************************************************************************************************
131 TrimSeqsCommand::TrimSeqsCommand(){
133 abort = true; calledHelp = true;
135 vector<string> tempOutNames;
136 outputTypes["fasta"] = tempOutNames;
137 outputTypes["qfile"] = tempOutNames;
138 outputTypes["group"] = tempOutNames;
139 outputTypes["name"] = tempOutNames;
140 outputTypes["count"] = tempOutNames;
142 catch(exception& e) {
143 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
147 //***************************************************************************************************************
149 TrimSeqsCommand::TrimSeqsCommand(string option) {
152 abort = false; calledHelp = false;
155 //allow user to run help
156 if(option == "help") { help(); abort = true; calledHelp = true; }
157 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
160 vector<string> myArray = setParameters();
162 OptionParser parser(option);
163 map<string,string> parameters = parser.getParameters();
165 ValidParameters validParameter;
166 map<string,string>::iterator it;
168 //check to make sure all parameters are valid for command
169 for (it = parameters.begin(); it != parameters.end(); it++) {
170 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
173 //initialize outputTypes
174 vector<string> tempOutNames;
175 outputTypes["fasta"] = tempOutNames;
176 outputTypes["qfile"] = tempOutNames;
177 outputTypes["group"] = tempOutNames;
178 outputTypes["name"] = tempOutNames;
179 outputTypes["count"] = tempOutNames;
181 //if the user changes the input directory command factory will send this info to us in the output parameter
182 string inputDir = validParameter.validFile(parameters, "inputdir", false);
183 if (inputDir == "not found"){ inputDir = ""; }
186 it = parameters.find("fasta");
187 //user has given a template file
188 if(it != parameters.end()){
189 path = m->hasPath(it->second);
190 //if the user has not given a path then, add inputdir. else leave path alone.
191 if (path == "") { parameters["fasta"] = inputDir + it->second; }
194 it = parameters.find("oligos");
195 //user has given a template file
196 if(it != parameters.end()){
197 path = m->hasPath(it->second);
198 //if the user has not given a path then, add inputdir. else leave path alone.
199 if (path == "") { parameters["oligos"] = inputDir + it->second; }
202 it = parameters.find("qfile");
203 //user has given a template file
204 if(it != parameters.end()){
205 path = m->hasPath(it->second);
206 //if the user has not given a path then, add inputdir. else leave path alone.
207 if (path == "") { parameters["qfile"] = inputDir + it->second; }
210 it = parameters.find("name");
211 //user has given a template file
212 if(it != parameters.end()){
213 path = m->hasPath(it->second);
214 //if the user has not given a path then, add inputdir. else leave path alone.
215 if (path == "") { parameters["name"] = inputDir + it->second; }
218 it = parameters.find("count");
219 //user has given a template file
220 if(it != parameters.end()){
221 path = m->hasPath(it->second);
222 //if the user has not given a path then, add inputdir. else leave path alone.
223 if (path == "") { parameters["count"] = inputDir + it->second; }
229 //check for required parameters
230 fastaFile = validParameter.validFile(parameters, "fasta", true);
231 if (fastaFile == "not found") {
232 fastaFile = m->getFastaFile();
233 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
234 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
235 }else if (fastaFile == "not open") { abort = true; }
236 else { m->setFastaFile(fastaFile); }
238 //if the user changes the output directory command factory will send this info to us in the output parameter
239 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
241 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
245 //check for optional parameter and set defaults
246 // ...at some point should added some additional type checking...
248 temp = validParameter.validFile(parameters, "flip", false);
249 if (temp == "not found") { flip = 0; }
250 else { flip = m->isTrue(temp); }
252 temp = validParameter.validFile(parameters, "oligos", true);
253 if (temp == "not found"){ oligoFile = ""; }
254 else if(temp == "not open"){ abort = true; }
255 else { oligoFile = temp; m->setOligosFile(oligoFile); }
258 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
259 m->mothurConvert(temp, maxAmbig);
261 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
262 m->mothurConvert(temp, maxHomoP);
264 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
265 m->mothurConvert(temp, minLength);
267 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
268 m->mothurConvert(temp, maxLength);
270 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
271 m->mothurConvert(temp, bdiffs);
273 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
274 m->mothurConvert(temp, pdiffs);
276 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
277 m->mothurConvert(temp, ldiffs);
279 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
280 m->mothurConvert(temp, sdiffs);
282 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
283 m->mothurConvert(temp, tdiffs);
285 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
287 temp = validParameter.validFile(parameters, "qfile", true);
288 if (temp == "not found") { qFileName = ""; }
289 else if(temp == "not open") { abort = true; }
290 else { qFileName = temp; m->setQualFile(qFileName); }
292 temp = validParameter.validFile(parameters, "name", true);
293 if (temp == "not found") { nameFile = ""; }
294 else if(temp == "not open") { nameFile = ""; abort = true; }
295 else { nameFile = temp; m->setNameFile(nameFile); }
297 countfile = validParameter.validFile(parameters, "count", true);
298 if (countfile == "not open") { abort = true; countfile = ""; }
299 else if (countfile == "not found") { countfile = ""; }
300 else { m->setCountTableFile(countfile); }
302 if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
304 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
305 m->mothurConvert(temp, qThreshold);
307 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
308 qtrim = m->isTrue(temp);
310 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
311 convert(temp, qRollAverage);
313 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
314 convert(temp, qWindowAverage);
316 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
317 convert(temp, qWindowSize);
319 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
320 convert(temp, qWindowStep);
322 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
323 convert(temp, qAverage);
325 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
326 convert(temp, keepFirst);
328 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
329 convert(temp, removeLast);
331 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
332 allFiles = m->isTrue(temp);
334 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
335 keepforward = m->isTrue(temp);
337 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
338 m->setProcessors(temp);
339 m->mothurConvert(temp, processors);
342 if(allFiles && (oligoFile == "")){
343 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
345 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
346 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
350 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
351 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
355 if (countfile == "") {
356 if (nameFile == "") {
357 vector<string> files; files.push_back(fastaFile);
358 parser.getNameFile(files);
364 catch(exception& e) {
365 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
369 //***************************************************************************************************************
371 int TrimSeqsCommand::execute(){
374 if (abort == true) { if (calledHelp) { return 0; } return 2; }
376 numFPrimers = 0; //this needs to be initialized
381 vector<vector<string> > fastaFileNames;
382 vector<vector<string> > qualFileNames;
383 vector<vector<string> > nameFileNames;
385 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta");
386 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
388 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta");
389 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
391 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile");
392 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile");
394 if (qFileName != "") {
395 outputNames.push_back(trimQualFile);
396 outputNames.push_back(scrapQualFile);
397 outputTypes["qfile"].push_back(trimQualFile);
398 outputTypes["qfile"].push_back(scrapQualFile);
401 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name");
402 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name");
404 if (nameFile != "") {
405 m->readNames(nameFile, nameMap);
406 outputNames.push_back(trimNameFile);
407 outputNames.push_back(scrapNameFile);
408 outputTypes["name"].push_back(trimNameFile);
409 outputTypes["name"].push_back(scrapNameFile);
412 string trimCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "trim." + getOutputFileNameTag("count");
413 string scrapCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "scrap." + getOutputFileNameTag("count");
415 if (countfile != "") {
417 ct.readTable(countfile);
418 nameCount = ct.getNameMap();
419 outputNames.push_back(trimCountFile);
420 outputNames.push_back(scrapCountFile);
421 outputTypes["count"].push_back(trimCountFile);
422 outputTypes["count"].push_back(scrapCountFile);
426 if (m->control_pressed) { return 0; }
428 string outputGroupFileName;
430 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
431 if ((createGroup) && (countfile == "")){
432 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group");
433 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
437 //fills lines and qlines
438 setLines(fastaFile, qFileName);
441 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
443 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
447 if (m->control_pressed) { return 0; }
450 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
451 map<string, string>::iterator it;
452 set<string> namesToRemove;
453 for(int i=0;i<fastaFileNames.size();i++){
454 for(int j=0;j<fastaFileNames[0].size();j++){
455 if (fastaFileNames[i][j] != "") {
456 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
457 if(m->isBlank(fastaFileNames[i][j])){
458 m->mothurRemove(fastaFileNames[i][j]);
459 namesToRemove.insert(fastaFileNames[i][j]);
462 m->mothurRemove(qualFileNames[i][j]);
463 namesToRemove.insert(qualFileNames[i][j]);
467 m->mothurRemove(nameFileNames[i][j]);
468 namesToRemove.insert(nameFileNames[i][j]);
471 it = uniqueFastaNames.find(fastaFileNames[i][j]);
472 if (it == uniqueFastaNames.end()) {
473 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
481 //remove names for outputFileNames, just cleans up the output
482 vector<string> outputNames2;
483 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
484 outputNames = outputNames2;
486 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
488 m->openInputFile(it->first, in);
491 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first));
492 if (countfile == "") { thisGroupName += getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
493 else { thisGroupName += getOutputFileNameTag("count"); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
494 m->openOutputFile(thisGroupName, out);
496 if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; }
499 if (m->control_pressed) { break; }
501 Sequence currSeq(in); m->gobble(in);
502 if (countfile == "") {
503 out << currSeq.getName() << '\t' << it->second << endl;
505 if (nameFile != "") {
506 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
507 if (itName != nameMap.end()) {
508 vector<string> thisSeqsNames;
509 m->splitAtChar(itName->second, thisSeqsNames, ',');
510 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
511 out << thisSeqsNames[k] << '\t' << it->second << endl;
513 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
516 map<string, int>::iterator itTotalReps = nameCount.find(currSeq.getName());
517 if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; }
518 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
525 if (countfile != "") { //create countfile with group info included
526 CountTable* ct = new CountTable();
527 ct->readTable(trimCountFile);
528 map<string, int> justTrimmedNames = ct->getNameMap();
532 for (map<string, int>::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); }
533 vector<int> tempCounts; tempCounts.resize(groupCounts.size(), 0);
534 for (map<string, int>::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) {
535 newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance
536 map<string, string>::iterator it2 = groupMap.find(itNames->first);
537 if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); }
538 else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; }
540 newCt.printTable(trimCountFile);
544 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
546 //output group counts
547 m->mothurOutEndLine();
549 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
550 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
551 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
553 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
555 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
557 //set fasta file as new current fastafile
559 itTypes = outputTypes.find("fasta");
560 if (itTypes != outputTypes.end()) {
561 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
564 itTypes = outputTypes.find("name");
565 if (itTypes != outputTypes.end()) {
566 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
569 itTypes = outputTypes.find("qfile");
570 if (itTypes != outputTypes.end()) {
571 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
574 itTypes = outputTypes.find("group");
575 if (itTypes != outputTypes.end()) {
576 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
579 itTypes = outputTypes.find("count");
580 if (itTypes != outputTypes.end()) {
581 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
584 m->mothurOutEndLine();
585 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
586 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
587 m->mothurOutEndLine();
592 catch(exception& e) {
593 m->errorOut(e, "TrimSeqsCommand", "execute");
598 /**************************************************************************************/
599 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
603 ofstream trimFASTAFile;
604 m->openOutputFile(trimFileName, trimFASTAFile);
606 ofstream scrapFASTAFile;
607 m->openOutputFile(scrapFileName, scrapFASTAFile);
609 ofstream trimQualFile;
610 ofstream scrapQualFile;
612 m->openOutputFile(trimQFileName, trimQualFile);
613 m->openOutputFile(scrapQFileName, scrapQualFile);
616 ofstream trimNameFile;
617 ofstream scrapNameFile;
619 m->openOutputFile(trimNFileName, trimNameFile);
620 m->openOutputFile(scrapNFileName, scrapNameFile);
623 ofstream trimCountFile;
624 ofstream scrapCountFile;
626 m->openOutputFile(trimCFileName, trimCountFile);
627 m->openOutputFile(scrapCFileName, scrapCountFile);
628 if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
631 ofstream outGroupsFile;
632 if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); }
634 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
635 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
636 if (fastaFileNames[i][j] != "") {
638 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
640 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
644 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
652 m->openInputFile(filename, inFASTA);
653 inFASTA.seekg(line.start);
656 if(qFileName != "") {
657 m->openInputFile(qFileName, qFile);
658 qFile.seekg(qline.start);
663 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
667 if (m->control_pressed) {
668 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
669 if ((createGroup) && (countfile == "")) { outGroupsFile.close(); }
670 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
671 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
672 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
673 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;
677 string trashCode = "";
678 int currentSeqsDiffs = 0;
680 Sequence currSeq(inFASTA); m->gobble(inFASTA);
681 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
683 QualityScores currQual;
685 currQual = QualityScores(qFile); m->gobble(qFile);
686 //cout << currQual.getName() << endl;
689 string origSeq = currSeq.getUnaligned();
692 int barcodeIndex = 0;
696 success = trimOligos.stripLinker(currSeq, currQual);
697 if(success > ldiffs) { trashCode += 'k'; }
698 else{ currentSeqsDiffs += success; }
702 if(barcodes.size() != 0){
703 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
704 if(success > bdiffs) { trashCode += 'b'; }
705 else{ currentSeqsDiffs += success; }
709 success = trimOligos.stripSpacer(currSeq, currQual);
710 if(success > sdiffs) { trashCode += 's'; }
711 else{ currentSeqsDiffs += success; }
715 if(numFPrimers != 0){
716 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
717 if(success > pdiffs) { trashCode += 'f'; }
718 else{ currentSeqsDiffs += success; }
721 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
723 if(numRPrimers != 0){
724 success = trimOligos.stripReverse(currSeq, currQual);
725 if(!success) { trashCode += 'r'; }
729 success = keepFirstTrim(currSeq, currQual);
733 success = removeLastTrim(currSeq, currQual);
734 if(!success) { trashCode += 'l'; }
739 int origLength = currSeq.getNumBases();
741 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
742 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
743 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
744 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
745 else { success = 1; }
747 //you don't want to trim, if it fails above then scrap it
748 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
750 if(!success) { trashCode += 'q'; }
753 if(minLength > 0 || maxLength > 0){
754 success = cullLength(currSeq);
755 if(!success) { trashCode += 'l'; }
758 success = cullHomoP(currSeq);
759 if(!success) { trashCode += 'h'; }
762 success = cullAmbigs(currSeq);
763 if(!success) { trashCode += 'n'; }
766 if(flip){ // should go last
767 currSeq.reverseComplement();
769 currQual.flipQScores();
773 if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
775 if(trashCode.length() == 0){
776 currSeq.setAligned(currSeq.getUnaligned());
777 currSeq.printSequence(trimFASTAFile);
780 currQual.printQScores(trimQualFile);
785 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
786 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
787 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
790 int numRedundants = 0;
791 if (countfile != "") {
792 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
793 if (itCount != nameCount.end()) {
794 trimCountFile << itCount->first << '\t' << itCount->second << endl;
795 numRedundants = itCount->second-1;
796 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
800 if(barcodes.size() != 0){
801 string thisGroup = barcodeNameVector[barcodeIndex];
802 if (primers.size() != 0) {
803 if (primerNameVector[primerIndex] != "") {
804 if(thisGroup != "") {
805 thisGroup += "." + primerNameVector[primerIndex];
807 thisGroup = primerNameVector[primerIndex];
812 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
814 if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
815 else { groupMap[currSeq.getName()] = thisGroup; }
817 if (nameFile != "") {
818 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
819 if (itName != nameMap.end()) {
820 vector<string> thisSeqsNames;
821 m->splitAtChar(itName->second, thisSeqsNames, ',');
822 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
823 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
824 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
826 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
829 map<string, int>::iterator it = groupCounts.find(thisGroup);
830 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
831 else { groupCounts[it->first] += (1 + numRedundants); }
838 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
839 currSeq.printSequence(output);
843 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
844 currQual.printQScores(output);
849 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
850 if (itName != nameMap.end()) {
851 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
852 output << itName->first << '\t' << itName->second << endl;
854 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
859 if(nameFile != ""){ //needs to be before the currSeq name is changed
860 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
861 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
862 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
864 if (countfile != "") {
865 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
866 if (itCount != nameCount.end()) {
867 trimCountFile << itCount->first << '\t' << itCount->second << endl;
868 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
871 currSeq.setName(currSeq.getName() + '|' + trashCode);
872 currSeq.setUnaligned(origSeq);
873 currSeq.setAligned(origSeq);
874 currSeq.printSequence(scrapFASTAFile);
876 currQual.printQScores(scrapQualFile);
882 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
883 unsigned long long pos = inFASTA.tellg();
884 if ((pos == -1) || (pos >= line.end)) { break; }
887 if (inFASTA.eof()) { break; }
891 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
895 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
899 trimFASTAFile.close();
900 scrapFASTAFile.close();
901 if (createGroup) { outGroupsFile.close(); }
902 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
903 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
904 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
908 catch(exception& e) {
909 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
914 /**************************************************************************************************/
916 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
923 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
924 //loop through and create all the processes you want
925 while (process != processors) {
929 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
933 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
934 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
935 vector<vector<string> > tempNameFileNames = nameFileNames;
940 for(int i=0;i<tempFASTAFileNames.size();i++){
941 for(int j=0;j<tempFASTAFileNames[i].size();j++){
942 if (tempFASTAFileNames[i][j] != "") {
943 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
944 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
947 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
948 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
951 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
952 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
959 driverCreateTrim(filename,
961 (trimFASTAFileName + toString(getpid()) + ".temp"),
962 (scrapFASTAFileName + toString(getpid()) + ".temp"),
963 (trimQualFileName + toString(getpid()) + ".temp"),
964 (scrapQualFileName + toString(getpid()) + ".temp"),
965 (trimNameFileName + toString(getpid()) + ".temp"),
966 (scrapNameFileName + toString(getpid()) + ".temp"),
967 (trimCountFileName + toString(getpid()) + ".temp"),
968 (scrapCountFileName + toString(getpid()) + ".temp"),
969 (groupFile + toString(getpid()) + ".temp"),
971 tempPrimerQualFileNames,
976 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
978 //pass groupCounts to parent
981 string tempFile = filename + toString(getpid()) + ".num.temp";
982 m->openOutputFile(tempFile, out);
984 out << groupCounts.size() << endl;
986 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
987 out << it->first << '\t' << it->second << endl;
990 out << groupMap.size() << endl;
991 for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
992 out << it->first << '\t' << it->second << endl;
998 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
999 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1006 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1007 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1008 if(qFileName != ""){
1009 m->openOutputFile(trimQualFileName, temp); temp.close();
1010 m->openOutputFile(scrapQualFileName, temp); temp.close();
1012 if (nameFile != "") {
1013 m->openOutputFile(trimNameFileName, temp); temp.close();
1014 m->openOutputFile(scrapNameFileName, temp); temp.close();
1016 if (countfile != "") {
1017 m->openOutputFile(trimCountFileName, temp); temp.close();
1018 m->openOutputFile(scrapCountFileName, temp); temp.close();
1021 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
1023 //force parent to wait until all the processes are done
1024 for (int i=0;i<processIDS.size();i++) {
1025 int temp = processIDS[i];
1029 //////////////////////////////////////////////////////////////////////////////////////////////////////
1030 //Windows version shared memory, so be careful when passing variables through the trimData struct.
1031 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1032 //////////////////////////////////////////////////////////////////////////////////////////////////////
1034 vector<trimData*> pDataArray;
1035 DWORD dwThreadIdArray[processors-1];
1036 HANDLE hThreadArray[processors-1];
1038 //Create processor worker threads.
1039 for( int i=0; i<processors-1; i++){
1041 string extension = "";
1042 if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
1043 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1044 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1045 vector<vector<string> > tempNameFileNames = nameFileNames;
1050 for(int i=0;i<tempFASTAFileNames.size();i++){
1051 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1052 if (tempFASTAFileNames[i][j] != "") {
1053 tempFASTAFileNames[i][j] += extension;
1054 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1056 if(qFileName != ""){
1057 tempPrimerQualFileNames[i][j] += extension;
1058 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1061 tempNameFileNames[i][j] += extension;
1062 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1070 trimData* tempTrim = new trimData(filename,
1071 qFileName, nameFile, countfile,
1072 (trimFASTAFileName+extension),
1073 (scrapFASTAFileName+extension),
1074 (trimQualFileName+extension),
1075 (scrapQualFileName+extension),
1076 (trimNameFileName+extension),
1077 (scrapNameFileName+extension),
1078 (trimCountFileName+extension),
1079 (scrapCountFileName+extension),
1080 (groupFile+extension),
1082 tempPrimerQualFileNames,
1084 lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
1085 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
1086 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
1087 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
1088 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
1089 pDataArray.push_back(tempTrim);
1091 hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1096 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1097 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1098 if(qFileName != ""){
1099 m->openOutputFile(trimQualFileName, temp); temp.close();
1100 m->openOutputFile(scrapQualFileName, temp); temp.close();
1102 if (nameFile != "") {
1103 m->openOutputFile(trimNameFileName, temp); temp.close();
1104 m->openOutputFile(scrapNameFileName, temp); temp.close();
1107 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
1108 processIDS.push_back(processors-1);
1111 //Wait until all threads have terminated.
1112 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1114 //Close all thread handles and free memory allocations.
1115 for(int i=0; i < pDataArray.size(); i++){
1116 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
1117 map<string, int>::iterator it2 = groupCounts.find(it->first);
1118 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
1119 else { groupCounts[it->first] += it->second; }
1121 for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
1122 map<string, string>::iterator it2 = groupMap.find(it->first);
1123 if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
1124 else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1126 CloseHandle(hThreadArray[i]);
1127 delete pDataArray[i];
1134 for(int i=0;i<processIDS.size();i++){
1136 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1138 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1139 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1140 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1141 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1143 if(qFileName != ""){
1144 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1145 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1146 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1147 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1151 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1152 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1153 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1154 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1157 if(countfile != ""){
1158 m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName);
1159 m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp"));
1160 m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName);
1161 m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp"));
1164 if((createGroup)&&(countfile == "")){
1165 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1166 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1171 for(int j=0;j<fastaFileNames.size();j++){
1172 for(int k=0;k<fastaFileNames[j].size();k++){
1173 if (fastaFileNames[j][k] != "") {
1174 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1175 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1177 if(qFileName != ""){
1178 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1179 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1183 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1184 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1191 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1194 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1195 m->openInputFile(tempFile, in);
1199 in >> tempNum; m->gobble(in);
1202 for (int i = 0; i < tempNum; i++) {
1204 in >> group >> groupNum; m->gobble(in);
1206 map<string, int>::iterator it = groupCounts.find(group);
1207 if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
1208 else { groupCounts[it->first] += groupNum; }
1211 in >> tempNum; m->gobble(in);
1213 for (int i = 0; i < tempNum; i++) {
1214 string group, seqName;
1215 in >> seqName >> group; m->gobble(in);
1217 map<string, string>::iterator it = groupMap.find(seqName);
1218 if (it == groupMap.end()) { groupMap[seqName] = group; }
1219 else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1223 in.close(); m->mothurRemove(tempFile);
1230 catch(exception& e) {
1231 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1236 /**************************************************************************************************/
1238 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1241 vector<unsigned long long> fastaFilePos;
1242 vector<unsigned long long> qfileFilePos;
1244 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1245 //set file positions for fasta file
1246 fastaFilePos = m->divideFile(filename, processors);
1248 //get name of first sequence in each chunk
1249 map<string, int> firstSeqNames;
1250 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1252 m->openInputFile(filename, in);
1253 in.seekg(fastaFilePos[i]);
1256 firstSeqNames[temp.getName()] = i;
1261 if(qfilename != "") {
1262 //seach for filePos of each first name in the qfile and save in qfileFilePos
1264 m->openInputFile(qfilename, inQual);
1267 while(!inQual.eof()){
1268 input = m->getline(inQual);
1270 if (input.length() != 0) {
1271 if(input[0] == '>'){ //this is a sequence name line
1272 istringstream nameStream(input);
1274 string sname = ""; nameStream >> sname;
1275 sname = sname.substr(1);
1277 map<string, int>::iterator it = firstSeqNames.find(sname);
1279 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1280 unsigned long long pos = inQual.tellg();
1281 qfileFilePos.push_back(pos - input.length() - 1);
1282 firstSeqNames.erase(it);
1287 if (firstSeqNames.size() == 0) { break; }
1292 if (firstSeqNames.size() != 0) {
1293 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1294 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1300 //get last file position of qfile
1302 unsigned long long size;
1304 //get num bytes in file
1305 pFile = fopen (qfilename.c_str(),"rb");
1306 if (pFile==NULL) perror ("Error opening file");
1308 fseek (pFile, 0, SEEK_END);
1313 qfileFilePos.push_back(size);
1316 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1317 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
1318 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1319 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1321 if(qfilename == "") { qLines = lines; } //files with duds
1327 if (processors == 1) { //save time
1328 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1329 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1330 lines.push_back(linePair(0, 1000));
1331 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1333 int numFastaSeqs = 0;
1334 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1335 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1337 if (qfilename != "") {
1338 int numQualSeqs = 0;
1339 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1341 if (numFastaSeqs != numQualSeqs) {
1342 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1346 //figure out how many sequences you have to process
1347 int numSeqsPerProcessor = numFastaSeqs / processors;
1348 for (int i = 0; i < processors; i++) {
1349 int startIndex = i * numSeqsPerProcessor;
1350 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1351 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1352 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1355 if(qfilename == "") { qLines = lines; } //files with duds
1360 catch(exception& e) {
1361 m->errorOut(e, "TrimSeqsCommand", "setLines");
1366 //***************************************************************************************************************
1368 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1371 m->openInputFile(oligoFile, inOligos);
1375 string type, oligo, group;
1377 int indexPrimer = 0;
1378 int indexBarcode = 0;
1380 while(!inOligos.eof()){
1384 if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
1387 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1388 m->gobble(inOligos);
1391 m->gobble(inOligos);
1392 //make type case insensitive
1393 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1397 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
1399 for(int i=0;i<oligo.length();i++){
1400 oligo[i] = toupper(oligo[i]);
1401 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1404 if(type == "FORWARD"){
1407 // get rest of line in case there is a primer name
1408 while (!inOligos.eof()) {
1409 char c = inOligos.get();
1410 if (c == 10 || c == 13){ break; }
1411 else if (c == 32 || c == 9){;} //space or tab
1412 else { group += c; }
1415 //check for repeat barcodes
1416 map<string, int>::iterator itPrime = primers.find(oligo);
1417 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1419 if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } }
1421 primers[oligo]=indexPrimer; indexPrimer++;
1422 primerNameVector.push_back(group);
1424 else if(type == "REVERSE"){
1425 //Sequence oligoRC("reverse", oligo);
1426 //oligoRC.reverseComplement();
1427 string oligoRC = reverseOligo(oligo);
1428 revPrimer.push_back(oligoRC);
1430 else if(type == "BARCODE"){
1433 //check for repeat barcodes
1434 map<string, int>::iterator itBar = barcodes.find(oligo);
1435 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1437 barcodes[oligo]=indexBarcode; indexBarcode++;
1438 barcodeNameVector.push_back(group);
1439 }else if(type == "LINKER"){
1440 linker.push_back(oligo);
1441 }else if(type == "SPACER"){
1442 spacer.push_back(oligo);
1444 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1446 m->gobble(inOligos);
1450 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1452 //add in potential combos
1453 if(barcodeNameVector.size() == 0){
1455 barcodeNameVector.push_back("");
1458 if(primerNameVector.size() == 0){
1460 primerNameVector.push_back("");
1463 fastaFileNames.resize(barcodeNameVector.size());
1464 for(int i=0;i<fastaFileNames.size();i++){
1465 fastaFileNames[i].assign(primerNameVector.size(), "");
1467 if(qFileName != "") { qualFileNames = fastaFileNames; }
1468 if(nameFile != "") { nameFileNames = fastaFileNames; }
1471 set<string> uniqueNames; //used to cleanup outputFileNames
1472 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1473 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1475 string primerName = primerNameVector[itPrimer->second];
1476 string barcodeName = barcodeNameVector[itBar->second];
1478 string comboGroupName = "";
1479 string fastaFileName = "";
1480 string qualFileName = "";
1481 string nameFileName = "";
1482 string countFileName = "";
1484 if(primerName == ""){
1485 comboGroupName = barcodeNameVector[itBar->second];
1488 if(barcodeName == ""){
1489 comboGroupName = primerNameVector[itPrimer->second];
1492 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1498 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1499 if (uniqueNames.count(fastaFileName) == 0) {
1500 outputNames.push_back(fastaFileName);
1501 outputTypes["fasta"].push_back(fastaFileName);
1502 uniqueNames.insert(fastaFileName);
1505 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1506 m->openOutputFile(fastaFileName, temp); temp.close();
1508 if(qFileName != ""){
1509 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1510 if (uniqueNames.count(qualFileName) == 0) {
1511 outputNames.push_back(qualFileName);
1512 outputTypes["qfile"].push_back(qualFileName);
1515 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1516 m->openOutputFile(qualFileName, temp); temp.close();
1520 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1521 if (uniqueNames.count(nameFileName) == 0) {
1522 outputNames.push_back(nameFileName);
1523 outputTypes["name"].push_back(nameFileName);
1526 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1527 m->openOutputFile(nameFileName, temp); temp.close();
1532 numFPrimers = primers.size();
1533 numRPrimers = revPrimer.size();
1534 numLinkers = linker.size();
1535 numSpacers = spacer.size();
1537 bool allBlank = true;
1538 for (int i = 0; i < barcodeNameVector.size(); i++) {
1539 if (barcodeNameVector[i] != "") {
1544 for (int i = 0; i < primerNameVector.size(); i++) {
1545 if (primerNameVector[i] != "") {
1552 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1560 catch(exception& e) {
1561 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1565 //***************************************************************************************************************
1567 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1570 if(qscores.getName() != ""){
1571 qscores.trimQScores(-1, keepFirst);
1573 sequence.trim(keepFirst);
1576 catch(exception& e) {
1577 m->errorOut(e, "keepFirstTrim", "countDiffs");
1583 //***************************************************************************************************************
1585 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1589 int length = sequence.getNumBases() - removeLast;
1592 if(qscores.getName() != ""){
1593 qscores.trimQScores(-1, length);
1595 sequence.trim(length);
1604 catch(exception& e) {
1605 m->errorOut(e, "removeLastTrim", "countDiffs");
1611 //***************************************************************************************************************
1613 bool TrimSeqsCommand::cullLength(Sequence& seq){
1616 int length = seq.getNumBases();
1617 bool success = 0; //guilty until proven innocent
1619 if(length >= minLength && maxLength == 0) { success = 1; }
1620 else if(length >= minLength && length <= maxLength) { success = 1; }
1621 else { success = 0; }
1626 catch(exception& e) {
1627 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1633 //***************************************************************************************************************
1635 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1637 int longHomoP = seq.getLongHomoPolymer();
1638 bool success = 0; //guilty until proven innocent
1640 if(longHomoP <= maxHomoP){ success = 1; }
1641 else { success = 0; }
1645 catch(exception& e) {
1646 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1651 //********************************************************************/
1652 string TrimSeqsCommand::reverseOligo(string oligo){
1654 string reverse = "";
1656 for(int i=oligo.length()-1;i>=0;i--){
1658 if(oligo[i] == 'A') { reverse += 'T'; }
1659 else if(oligo[i] == 'T'){ reverse += 'A'; }
1660 else if(oligo[i] == 'U'){ reverse += 'A'; }
1662 else if(oligo[i] == 'G'){ reverse += 'C'; }
1663 else if(oligo[i] == 'C'){ reverse += 'G'; }
1665 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1666 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1668 else if(oligo[i] == 'M'){ reverse += 'K'; }
1669 else if(oligo[i] == 'K'){ reverse += 'M'; }
1671 else if(oligo[i] == 'W'){ reverse += 'W'; }
1672 else if(oligo[i] == 'S'){ reverse += 'S'; }
1674 else if(oligo[i] == 'B'){ reverse += 'V'; }
1675 else if(oligo[i] == 'V'){ reverse += 'B'; }
1677 else if(oligo[i] == 'D'){ reverse += 'H'; }
1678 else if(oligo[i] == 'H'){ reverse += 'D'; }
1680 else { reverse += 'N'; }
1686 catch(exception& e) {
1687 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1692 //***************************************************************************************************************
1694 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1696 int numNs = seq.getAmbigBases();
1697 bool success = 0; //guilty until proven innocent
1699 if(numNs <= maxAmbig) { success = 1; }
1700 else { success = 0; }
1704 catch(exception& e) {
1705 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1710 //***************************************************************************************************************