5 * Created by Sarah Westcott on 7/8/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getseqscommand.h"
11 #include "sequence.hpp"
12 #include "listvector.hpp"
13 #include "counttable.h"
15 //**********************************************************************************************************************
16 vector<string> GetSeqsCommand::setParameters(){
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none","fasta",false,false,true); parameters.push_back(pfasta);
19 CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "FNGLT", "none","fastq",false,false,true); parameters.push_back(pfastq);
20 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none","name",false,false,true); parameters.push_back(pname);
21 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none","count",false,false,true); parameters.push_back(pcount);
22 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none","group",false,false,true); parameters.push_back(pgroup);
23 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none","list",false,false,true); parameters.push_back(plist);
24 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none","taxonomy",false,false,true); parameters.push_back(ptaxonomy);
25 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none","alignreport",false,false); parameters.push_back(palignreport);
26 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none","qfile",false,false); parameters.push_back(pqfile);
27 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(paccnos);
28 CommandParameter pdups("dups", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pdups);
29 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
31 CommandParameter paccnos2("accnos2", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos2);
33 vector<string> myArray;
34 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
38 m->errorOut(e, "GetSeqsCommand", "setParameters");
42 //**********************************************************************************************************************
43 string GetSeqsCommand::getHelpString(){
45 string helpString = "";
46 helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, count, list, taxonomy, quality, fastq or alignreport file.\n";
47 helpString += "It outputs a file containing only the sequences in the .accnos file.\n";
48 helpString += "The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport, fastq and dups. You must provide accnos unless you have a valid current accnos file, and at least one of the other parameters.\n";
49 helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=true. \n";
50 helpString += "The get.seqs command should be in the following format: get.seqs(accnos=yourAccnos, fasta=yourFasta).\n";
51 helpString += "Example get.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n";
52 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
56 m->errorOut(e, "GetSeqsCommand", "getHelpString");
61 //**********************************************************************************************************************
62 GetSeqsCommand::GetSeqsCommand(){
64 abort = true; calledHelp = true;
66 vector<string> tempOutNames;
67 outputTypes["fasta"] = tempOutNames;
68 outputTypes["fastq"] = tempOutNames;
69 outputTypes["taxonomy"] = tempOutNames;
70 outputTypes["name"] = tempOutNames;
71 outputTypes["group"] = tempOutNames;
72 outputTypes["alignreport"] = tempOutNames;
73 outputTypes["list"] = tempOutNames;
74 outputTypes["qfile"] = tempOutNames;
75 outputTypes["count"] = tempOutNames;
76 outputTypes["accnosreport"] = tempOutNames;
79 m->errorOut(e, "GetSeqsCommand", "GetSeqsCommand");
83 //**********************************************************************************************************************
84 string GetSeqsCommand::getOutputPattern(string type) {
88 if (type == "fasta") { pattern = "[filename],pick,[extension]"; }
89 else if (type == "fastq") { pattern = "[filename],pick,[extension]"; }
90 else if (type == "taxonomy") { pattern = "[filename],pick,[extension]"; }
91 else if (type == "name") { pattern = "[filename],pick,[extension]"; }
92 else if (type == "group") { pattern = "[filename],pick,[extension]"; }
93 else if (type == "count") { pattern = "[filename],pick,[extension]"; }
94 else if (type == "list") { pattern = "[filename],[distance],pick,[extension]"; }
95 else if (type == "qfile") { pattern = "[filename],pick,[extension]"; }
96 else if (type == "accnosreport") { pattern = "[filename],pick.accnos.report"; }
97 else if (type == "alignreport") { pattern = "[filename],pick.align.report"; }
98 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
102 catch(exception& e) {
103 m->errorOut(e, "GetSeqsCommand", "getOutputPattern");
107 //**********************************************************************************************************************
108 GetSeqsCommand::GetSeqsCommand(string option) {
110 abort = false; calledHelp = false;
112 //allow user to run help
113 if(option == "help") { help(); abort = true; calledHelp = true; }
114 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
117 vector<string> myArray = setParameters();
119 OptionParser parser(option);
120 map<string,string> parameters = parser.getParameters();
122 ValidParameters validParameter;
123 map<string,string>::iterator it;
125 //check to make sure all parameters are valid for command
126 for (it = parameters.begin(); it != parameters.end(); it++) {
127 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
130 //initialize outputTypes
131 vector<string> tempOutNames;
132 outputTypes["fasta"] = tempOutNames;
133 outputTypes["fastq"] = tempOutNames;
134 outputTypes["taxonomy"] = tempOutNames;
135 outputTypes["name"] = tempOutNames;
136 outputTypes["group"] = tempOutNames;
137 outputTypes["alignreport"] = tempOutNames;
138 outputTypes["list"] = tempOutNames;
139 outputTypes["qfile"] = tempOutNames;
140 outputTypes["accnosreport"] = tempOutNames;
141 outputTypes["count"] = tempOutNames;
143 //if the user changes the output directory command factory will send this info to us in the output parameter
144 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
146 //if the user changes the input directory command factory will send this info to us in the output parameter
147 string inputDir = validParameter.validFile(parameters, "inputdir", false);
148 if (inputDir == "not found"){ inputDir = ""; }
151 it = parameters.find("alignreport");
152 //user has given a template file
153 if(it != parameters.end()){
154 path = m->hasPath(it->second);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
159 it = parameters.find("fasta");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["fasta"] = inputDir + it->second; }
167 it = parameters.find("accnos");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["accnos"] = inputDir + it->second; }
175 it = parameters.find("accnos2");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["accnos2"] = inputDir + it->second; }
183 it = parameters.find("list");
184 //user has given a template file
185 if(it != parameters.end()){
186 path = m->hasPath(it->second);
187 //if the user has not given a path then, add inputdir. else leave path alone.
188 if (path == "") { parameters["list"] = inputDir + it->second; }
191 it = parameters.find("name");
192 //user has given a template file
193 if(it != parameters.end()){
194 path = m->hasPath(it->second);
195 //if the user has not given a path then, add inputdir. else leave path alone.
196 if (path == "") { parameters["name"] = inputDir + it->second; }
199 it = parameters.find("group");
200 //user has given a template file
201 if(it != parameters.end()){
202 path = m->hasPath(it->second);
203 //if the user has not given a path then, add inputdir. else leave path alone.
204 if (path == "") { parameters["group"] = inputDir + it->second; }
207 it = parameters.find("taxonomy");
208 //user has given a template file
209 if(it != parameters.end()){
210 path = m->hasPath(it->second);
211 //if the user has not given a path then, add inputdir. else leave path alone.
212 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
215 it = parameters.find("qfile");
216 //user has given a template file
217 if(it != parameters.end()){
218 path = m->hasPath(it->second);
219 //if the user has not given a path then, add inputdir. else leave path alone.
220 if (path == "") { parameters["qfile"] = inputDir + it->second; }
223 it = parameters.find("count");
224 //user has given a template file
225 if(it != parameters.end()){
226 path = m->hasPath(it->second);
227 //if the user has not given a path then, add inputdir. else leave path alone.
228 if (path == "") { parameters["count"] = inputDir + it->second; }
231 it = parameters.find("fastq");
232 //user has given a template file
233 if(it != parameters.end()){
234 path = m->hasPath(it->second);
235 //if the user has not given a path then, add inputdir. else leave path alone.
236 if (path == "") { parameters["fastq"] = inputDir + it->second; }
241 //check for required parameters
242 accnosfile = validParameter.validFile(parameters, "accnos", true);
243 if (accnosfile == "not open") { abort = true; }
244 else if (accnosfile == "not found") {
245 accnosfile = m->getAccnosFile();
246 if (accnosfile != "") { m->mothurOut("Using " + accnosfile + " as input file for the accnos parameter."); m->mothurOutEndLine(); }
248 m->mothurOut("You have no valid accnos file and accnos is required."); m->mothurOutEndLine();
251 }else { m->setAccnosFile(accnosfile); }
253 if (accnosfile2 == "not found") { accnosfile2 = ""; }
255 fastafile = validParameter.validFile(parameters, "fasta", true);
256 if (fastafile == "not open") { fastafile = ""; abort = true; }
257 else if (fastafile == "not found") { fastafile = ""; }
258 else { m->setFastaFile(fastafile); }
260 namefile = validParameter.validFile(parameters, "name", true);
261 if (namefile == "not open") { namefile = ""; abort = true; }
262 else if (namefile == "not found") { namefile = ""; }
263 else { m->setNameFile(namefile); }
265 groupfile = validParameter.validFile(parameters, "group", true);
266 if (groupfile == "not open") { abort = true; }
267 else if (groupfile == "not found") { groupfile = ""; }
268 else { m->setGroupFile(groupfile); }
270 alignfile = validParameter.validFile(parameters, "alignreport", true);
271 if (alignfile == "not open") { abort = true; }
272 else if (alignfile == "not found") { alignfile = ""; }
274 listfile = validParameter.validFile(parameters, "list", true);
275 if (listfile == "not open") { abort = true; }
276 else if (listfile == "not found") { listfile = ""; }
277 else { m->setListFile(listfile); }
279 taxfile = validParameter.validFile(parameters, "taxonomy", true);
280 if (taxfile == "not open") { taxfile = ""; abort = true; }
281 else if (taxfile == "not found") { taxfile = ""; }
282 else { m->setTaxonomyFile(taxfile); }
284 qualfile = validParameter.validFile(parameters, "qfile", true);
285 if (qualfile == "not open") { abort = true; }
286 else if (qualfile == "not found") { qualfile = ""; }
287 else { m->setQualFile(qualfile); }
289 fastqfile = validParameter.validFile(parameters, "fastq", true);
290 if (fastqfile == "not open") { abort = true; }
291 else if (fastqfile == "not found") { fastqfile = ""; }
293 accnosfile2 = validParameter.validFile(parameters, "accnos2", true);
294 if (accnosfile2 == "not open") { abort = true; }
295 else if (accnosfile2 == "not found") { accnosfile2 = ""; }
297 countfile = validParameter.validFile(parameters, "count", true);
298 if (countfile == "not open") { countfile = ""; abort = true; }
299 else if (countfile == "not found") { countfile = ""; }
300 else { m->setCountTableFile(countfile); }
302 if ((namefile != "") && (countfile != "")) {
303 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
306 if ((groupfile != "") && (countfile != "")) {
307 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
311 string usedDups = "true";
312 string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "true"; usedDups = ""; }
313 dups = m->isTrue(temp);
315 if ((fastqfile == "") && (fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "") && (countfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy, quality, fastq or listfile."); m->mothurOutEndLine(); abort = true; }
317 if (countfile == "") {
318 if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
319 vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
320 parser.getNameFile(files);
326 catch(exception& e) {
327 m->errorOut(e, "GetSeqsCommand", "GetSeqsCommand");
331 //**********************************************************************************************************************
333 int GetSeqsCommand::execute(){
336 if (abort == true) { if (calledHelp) { return 0; } return 2; }
338 //get names you want to keep
339 names = m->readAccnos(accnosfile);
341 if (m->control_pressed) { return 0; }
343 if (countfile != "") {
344 if ((fastafile != "") || (listfile != "") || (taxfile != "")) {
345 m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n");
349 //read through the correct file and output lines you want to keep
350 if (namefile != "") { readName(); }
351 if (fastafile != "") { readFasta(); }
352 if (fastqfile != "") { readFastq(); }
353 if (groupfile != "") { readGroup(); }
354 if (countfile != "") { readCount(); }
355 if (alignfile != "") { readAlign(); }
356 if (listfile != "") { readList(); }
357 if (taxfile != "") { readTax(); }
358 if (qualfile != "") { readQual(); }
359 if (accnosfile2 != "") { compareAccnos(); }
361 if (m->debug) { runSanityCheck(); }
363 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
366 if (outputNames.size() != 0) {
367 m->mothurOutEndLine();
368 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
369 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
370 m->mothurOutEndLine();
372 //set fasta file as new current fastafile
374 itTypes = outputTypes.find("fasta");
375 if (itTypes != outputTypes.end()) {
376 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
379 itTypes = outputTypes.find("name");
380 if (itTypes != outputTypes.end()) {
381 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
384 itTypes = outputTypes.find("group");
385 if (itTypes != outputTypes.end()) {
386 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
389 itTypes = outputTypes.find("list");
390 if (itTypes != outputTypes.end()) {
391 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
394 itTypes = outputTypes.find("taxonomy");
395 if (itTypes != outputTypes.end()) {
396 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
399 itTypes = outputTypes.find("qfile");
400 if (itTypes != outputTypes.end()) {
401 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
404 itTypes = outputTypes.find("count");
405 if (itTypes != outputTypes.end()) {
406 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
413 catch(exception& e) {
414 m->errorOut(e, "GetSeqsCommand", "execute");
418 //**********************************************************************************************************************
419 int GetSeqsCommand::readFastq(){
421 bool wroteSomething = false;
422 int selectedCount = 0;
425 m->openInputFile(fastqfile, in);
427 string thisOutputDir = outputDir;
428 if (outputDir == "") { thisOutputDir += m->hasPath(fastqfile); }
429 map<string, string> variables;
430 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastqfile));
431 variables["[extension]"] = m->getExtension(fastqfile);
432 string outputFileName = getOutputFileName("fastq", variables);
434 m->openOutputFile(outputFileName, out);
439 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
442 string input = m->getline(in); m->gobble(in);
444 string outputString = input + "\n";
446 if (input[0] == '@') {
448 outputString += m->getline(in) + "\n"; m->gobble(in);
449 outputString += m->getline(in) + "\n"; m->gobble(in);
450 outputString += m->getline(in) + "\n"; m->gobble(in);
452 vector<string> splits = m->splitWhiteSpace(input);
453 string name = splits[0];
454 name = name.substr(1);
457 if (names.count(name) != 0) {
458 wroteSomething = true;
470 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
471 outputNames.push_back(outputFileName); outputTypes["fastq"].push_back(outputFileName);
473 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your fastq file."); m->mothurOutEndLine();
478 catch(exception& e) {
479 m->errorOut(e, "GetSeqsCommand", "readFastq");
484 //**********************************************************************************************************************
485 int GetSeqsCommand::readFasta(){
487 string thisOutputDir = outputDir;
488 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
489 map<string, string> variables;
490 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
491 variables["[extension]"] = m->getExtension(fastafile);
492 string outputFileName = getOutputFileName("fasta", variables);
494 m->openOutputFile(outputFileName, out);
498 m->openInputFile(fastafile, in);
501 bool wroteSomething = false;
502 int selectedCount = 0;
504 if (m->debug) { set<string> temp; sanity["fasta"] = temp; }
508 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
510 Sequence currSeq(in);
511 name = currSeq.getName();
513 if (!dups) {//adjust name if needed
514 map<string, string>::iterator it = uniqueMap.find(name);
515 if (it != uniqueMap.end()) { currSeq.setName(it->second); }
518 name = currSeq.getName();
521 //if this name is in the accnos file
522 if (names.count(name) != 0) {
523 wroteSomething = true;
525 currSeq.printSequence(out);
528 if (m->debug) { sanity["fasta"].insert(name); }
537 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
538 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
540 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your fasta file."); m->mothurOutEndLine();
545 catch(exception& e) {
546 m->errorOut(e, "GetSeqsCommand", "readFasta");
550 //**********************************************************************************************************************
551 int GetSeqsCommand::readQual(){
553 string thisOutputDir = outputDir;
554 if (outputDir == "") { thisOutputDir += m->hasPath(qualfile); }
555 map<string, string> variables;
556 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(qualfile));
557 variables["[extension]"] = m->getExtension(qualfile);
558 string outputFileName = getOutputFileName("qfile", variables);
560 m->openOutputFile(outputFileName, out);
564 m->openInputFile(qualfile, in);
567 bool wroteSomething = false;
568 int selectedCount = 0;
570 if (m->debug) { set<string> temp; sanity["qual"] = temp; }
573 string saveName = "";
579 if (!dups) {//adjust name if needed
580 map<string, string>::iterator it = uniqueMap.find(name);
581 if (it != uniqueMap.end()) { name = it->second; }
584 if (name.length() != 0) {
585 saveName = name.substr(1);
588 if (c == 10 || c == 13 || c == -1){ break; }
595 char letter= in.get();
596 if(letter == '>'){ in.putback(letter); break; }
597 else{ scores += letter; }
602 if (names.count(saveName) != 0) {
603 wroteSomething = true;
605 out << name << endl << scores;
607 if (m->debug) { sanity["qual"].insert(name); }
616 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
617 outputNames.push_back(outputFileName); outputTypes["qfile"].push_back(outputFileName);
619 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your quality file."); m->mothurOutEndLine();
625 catch(exception& e) {
626 m->errorOut(e, "GetSeqsCommand", "readQual");
630 //**********************************************************************************************************************
631 int GetSeqsCommand::readCount(){
633 string thisOutputDir = outputDir;
634 if (outputDir == "") { thisOutputDir += m->hasPath(countfile); }
635 map<string, string> variables;
636 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(countfile));
637 variables["[extension]"] = m->getExtension(countfile);
638 string outputFileName = getOutputFileName("count", variables);
641 m->openOutputFile(outputFileName, out);
644 m->openInputFile(countfile, in);
646 bool wroteSomething = false;
647 int selectedCount = 0;
649 string headers = m->getline(in); m->gobble(in);
650 out << headers << endl;
652 string name, rest; int thisTotal;
655 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
657 in >> name; m->gobble(in);
658 in >> thisTotal; m->gobble(in);
659 rest = m->getline(in); m->gobble(in);
660 if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); }
662 if (names.count(name) != 0) {
663 out << name << '\t' << thisTotal << '\t' << rest << endl;
664 wroteSomething = true;
665 selectedCount+= thisTotal;
671 //check for groups that have been eliminated
673 if (ct.testGroups(outputFileName)) {
674 ct.readTable(outputFileName, true, false);
675 ct.printTable(outputFileName);
678 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
679 outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
681 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your count file."); m->mothurOutEndLine();
685 catch(exception& e) {
686 m->errorOut(e, "GetSeqsCommand", "readCount");
691 //**********************************************************************************************************************
692 int GetSeqsCommand::readList(){
694 string thisOutputDir = outputDir;
695 if (outputDir == "") { thisOutputDir += m->hasPath(listfile); }
696 map<string, string> variables;
697 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(listfile));
698 variables["[extension]"] = m->getExtension(listfile);
701 m->openInputFile(listfile, in);
703 bool wroteSomething = false;
704 int selectedCount = 0;
706 if (m->debug) { set<string> temp; sanity["list"] = temp; }
712 //read in list vector
715 //make a new list vector
717 newList.setLabel(list.getLabel());
719 variables["[distance]"] = list.getLabel();
720 string outputFileName = getOutputFileName("list", variables);
723 m->openOutputFile(outputFileName, out);
724 outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName);
726 vector<string> binLabels = list.getLabels();
727 vector<string> newBinLabels;
729 if (m->control_pressed) { in.close(); out.close(); return 0; }
732 for (int i = 0; i < list.getNumBins(); i++) {
734 //parse out names that are in accnos file
735 string binnames = list.get(i);
736 vector<string> bnames;
737 m->splitAtComma(binnames, bnames);
739 string newNames = "";
740 for (int j = 0; j < bnames.size(); j++) {
741 string name = bnames[j];
742 //if that name is in the .accnos file, add it
743 if (names.count(name) != 0) { newNames += name + ","; selectedCount++; if (m->debug) { sanity["list"].insert(name); } }
746 //if there are names in this bin add to new list
747 if (newNames != "") {
748 newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma
749 newList.push_back(newNames);
750 newBinLabels.push_back(binLabels[i]);
754 //print new listvector
755 if (newList.getNumBins() != 0) {
756 wroteSomething = true;
757 newList.setLabels(newBinLabels);
758 newList.printHeaders(out);
768 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
770 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your list file."); m->mothurOutEndLine();
775 catch(exception& e) {
776 m->errorOut(e, "GetSeqsCommand", "readList");
780 //**********************************************************************************************************************
781 int GetSeqsCommand::readName(){
783 string thisOutputDir = outputDir;
784 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
785 map<string, string> variables;
786 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
787 variables["[extension]"] = m->getExtension(namefile);
788 string outputFileName = getOutputFileName("name", variables);
790 m->openOutputFile(outputFileName, out);
794 m->openInputFile(namefile, in);
795 string name, firstCol, secondCol;
797 bool wroteSomething = false;
798 int selectedCount = 0;
800 if (m->debug) { set<string> temp; sanity["name"] = temp; }
801 if (m->debug) { set<string> temp; sanity["dupname"] = temp; }
805 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
807 in >> firstCol; m->gobble(in);
811 if (dups) { hold = secondCol; }
813 vector<string> parsedNames;
814 m->splitAtComma(secondCol, parsedNames);
816 vector<string> validSecond;
817 for (int i = 0; i < parsedNames.size(); i++) {
818 if (names.count(parsedNames[i]) != 0) {
819 validSecond.push_back(parsedNames[i]);
820 if (m->debug) { sanity["dupname"].insert(parsedNames[i]); }
824 if ((dups) && (validSecond.size() != 0)) { //dups = true and we want to add someone, then add everyone
825 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); if (m->debug) { sanity["dupname"].insert(parsedNames[i]); } }
826 out << firstCol << '\t' << hold << endl;
827 wroteSomething = true;
828 selectedCount += parsedNames.size();
829 if (m->debug) { sanity["name"].insert(firstCol); }
832 selectedCount += validSecond.size();
834 //if the name in the first column is in the set then print it and any other names in second column also in set
835 if (names.count(firstCol) != 0) {
837 wroteSomething = true;
839 out << firstCol << '\t';
841 //you know you have at least one valid second since first column is valid
842 for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
843 out << validSecond[validSecond.size()-1] << endl;
845 if (m->debug) { sanity["name"].insert(firstCol); }
848 //make first name in set you come to first column and then add the remaining names to second column
851 //you want part of this row
852 if (validSecond.size() != 0) {
854 wroteSomething = true;
856 out << validSecond[0] << '\t';
857 //we are changing the unique name in the fasta file
858 uniqueMap[firstCol] = validSecond[0];
860 //you know you have at least one valid second since first column is valid
861 for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
862 out << validSecond[validSecond.size()-1] << endl;
864 if (m->debug) { sanity["name"].insert(validSecond[0]); }
873 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
874 outputNames.push_back(outputFileName); outputTypes["name"].push_back(outputFileName);
876 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your name file."); m->mothurOutEndLine();
881 catch(exception& e) {
882 m->errorOut(e, "GetSeqsCommand", "readName");
887 //**********************************************************************************************************************
888 int GetSeqsCommand::readGroup(){
890 string thisOutputDir = outputDir;
891 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
892 map<string, string> variables;
893 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
894 variables["[extension]"] = m->getExtension(groupfile);
895 string outputFileName = getOutputFileName("group", variables);
897 m->openOutputFile(outputFileName, out);
901 m->openInputFile(groupfile, in);
904 bool wroteSomething = false;
905 int selectedCount = 0;
907 if (m->debug) { set<string> temp; sanity["group"] = temp; }
911 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
914 in >> name; //read from first column
915 in >> group; //read from second column
918 //if this name is in the accnos file
919 if (names.count(name) != 0) {
920 wroteSomething = true;
922 out << name << '\t' << group << endl;
925 if (m->debug) { sanity["group"].insert(name); }
933 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
934 outputNames.push_back(outputFileName); outputTypes["group"].push_back(outputFileName);
936 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your group file."); m->mothurOutEndLine();
942 catch(exception& e) {
943 m->errorOut(e, "GetSeqsCommand", "readGroup");
947 //**********************************************************************************************************************
948 int GetSeqsCommand::readTax(){
950 string thisOutputDir = outputDir;
951 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
952 map<string, string> variables;
953 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
954 variables["[extension]"] = m->getExtension(taxfile);
955 string outputFileName = getOutputFileName("taxonomy", variables);
957 m->openOutputFile(outputFileName, out);
960 m->openInputFile(taxfile, in);
963 bool wroteSomething = false;
964 int selectedCount = 0;
966 if (m->debug) { set<string> temp; sanity["tax"] = temp; }
970 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
972 in >> name; //read from first column
973 in >> tax; //read from second column
975 if (!dups) {//adjust name if needed
976 map<string, string>::iterator it = uniqueMap.find(name);
977 if (it != uniqueMap.end()) { name = it->second; }
980 //if this name is in the accnos file
981 if (names.count(name) != 0) {
982 wroteSomething = true;
984 out << name << '\t' << tax << endl;
987 if (m->debug) { sanity["tax"].insert(name); }
995 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
996 outputNames.push_back(outputFileName); outputTypes["taxonomy"].push_back(outputFileName);
998 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1003 catch(exception& e) {
1004 m->errorOut(e, "GetSeqsCommand", "readTax");
1008 //**********************************************************************************************************************
1009 //alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name
1010 int GetSeqsCommand::readAlign(){
1012 string thisOutputDir = outputDir;
1013 if (outputDir == "") { thisOutputDir += m->hasPath(alignfile); }
1014 map<string, string> variables;
1015 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(alignfile));
1016 string outputFileName = getOutputFileName("alignreport", variables);
1018 m->openOutputFile(outputFileName, out);
1022 m->openInputFile(alignfile, in);
1025 bool wroteSomething = false;
1026 int selectedCount = 0;
1028 //read column headers
1029 for (int i = 0; i < 16; i++) {
1030 if (!in.eof()) { in >> junk; out << junk << '\t'; }
1037 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1040 in >> name; //read from first column
1042 if (!dups) {//adjust name if needed
1043 map<string, string>::iterator it = uniqueMap.find(name);
1044 if (it != uniqueMap.end()) { name = it->second; }
1047 //if this name is in the accnos file
1048 if (names.count(name) != 0) {
1049 wroteSomething = true;
1052 out << name << '\t';
1055 for (int i = 0; i < 15; i++) {
1056 if (!in.eof()) { in >> junk; out << junk << '\t'; }
1061 }else {//still read just don't do anything with it
1063 for (int i = 0; i < 15; i++) {
1064 if (!in.eof()) { in >> junk; }
1074 if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); }
1075 outputNames.push_back(outputFileName); outputTypes["alignreport"].push_back(outputFileName);
1077 m->mothurOut("Selected " + toString(selectedCount) + " sequences from your alignreport file."); m->mothurOutEndLine();
1082 catch(exception& e) {
1083 m->errorOut(e, "GetSeqsCommand", "readAlign");
1087 //**********************************************************************************************************************
1088 //just looking at common mistakes.
1089 int GetSeqsCommand::runSanityCheck(){
1091 string thisOutputDir = outputDir;
1092 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
1093 string filename = outputDir + "get.seqs.debug.report";
1096 m->openOutputFile(filename, out);
1099 //compare fasta, name, qual and taxonomy if given to make sure they contain the same seqs
1100 if (fastafile != "") {
1101 if (namefile != "") { //compare with fasta
1102 if (sanity["fasta"] != sanity["name"]) { //create mismatch file
1103 createMisMatchFile(out, fastafile, namefile, sanity["fasta"], sanity["name"]);
1106 if (qualfile != "") {
1107 if (sanity["fasta"] != sanity["qual"]) { //create mismatch file
1108 createMisMatchFile(out, fastafile, qualfile, sanity["fasta"], sanity["qual"]);
1111 if (taxfile != "") {
1112 if (sanity["fasta"] != sanity["tax"]) { //create mismatch file
1113 createMisMatchFile(out, fastafile, taxfile, sanity["fasta"], sanity["tax"]);
1118 //compare dupnames, groups and list if given to make sure they match
1119 if (namefile != "") {
1120 if (groupfile != "") {
1121 if (sanity["dupname"] != sanity["group"]) { //create mismatch file
1122 createMisMatchFile(out, namefile, groupfile, sanity["dupname"], sanity["group"]);
1125 if (listfile != "") {
1126 if (sanity["dupname"] != sanity["list"]) { //create mismatch file
1127 createMisMatchFile(out, namefile, listfile, sanity["dupname"], sanity["list"]);
1132 if ((groupfile != "") && (fastafile != "")) {
1133 if (sanity["fasta"] != sanity["group"]) { //create mismatch file
1134 createMisMatchFile(out, fastafile, groupfile, sanity["fasta"], sanity["group"]);
1141 if (m->isBlank(filename)) { m->mothurRemove(filename); }
1142 else { m->mothurOut("\n[DEBUG]: " + filename + " contains the file mismatches.\n");outputNames.push_back(filename); outputTypes["debug"].push_back(filename); }
1146 catch(exception& e) {
1147 m->errorOut(e, "GetSeqsCommand", "runSanityCheck");
1151 //**********************************************************************************************************************
1152 //just looking at common mistakes.
1153 int GetSeqsCommand::createMisMatchFile(ofstream& out, string filename1, string filename2, set<string> set1, set<string> set2){
1155 out << "****************************************" << endl << endl;
1156 out << "Names unique to " << filename1 << ":\n";
1158 //remove names in set1 that are also in set2
1159 for (set<string>::iterator it = set1.begin(); it != set1.end();) {
1162 if (set2.count(name) == 0) { out << name << endl; } //name unique to set1
1163 else { set2.erase(name); } //you are in both so erase
1167 out << "\nNames unique to " << filename2 << ":\n";
1169 for (set<string>::iterator it = set2.begin(); it != set2.end(); it++) { out << *it << endl; }
1171 out << "****************************************" << endl << endl;
1175 catch(exception& e) {
1176 m->errorOut(e, "GetSeqsCommand", "runSanityCheck");
1180 //**********************************************************************************************************************
1182 int GetSeqsCommand::compareAccnos(){
1185 string thisOutputDir = outputDir;
1186 if (outputDir == "") { thisOutputDir += m->hasPath(accnosfile); }
1187 map<string, string> variables;
1188 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(accnosfile));
1189 string outputFileName = getOutputFileName("accnosreport", variables);
1192 m->openOutputFile(outputFileName, out);
1195 m->openInputFile(accnosfile2, in);
1198 set<string> namesAccnos2;
1199 set<string> namesDups;
1200 set<string> namesAccnos = names;
1202 map<string, int> nameCount;
1204 if (namefile != "") {
1206 m->openInputFile(namefile, inName);
1209 while(!inName.eof()){
1211 if (m->control_pressed) { inName.close(); return 0; }
1213 string thisname, repnames;
1215 inName >> thisname; m->gobble(inName); //read from first column
1216 inName >> repnames; //read from second column
1218 int num = m->getNumNames(repnames);
1219 nameCount[thisname] = num;
1229 if (namesAccnos.count(name) == 0){ //name unique to accnos2
1230 int pos = name.find_last_of('_');
1231 string tempName = name;
1232 if (pos != string::npos) { tempName = tempName.substr(pos+1); cout << tempName << endl; }
1233 if (namesAccnos.count(tempName) == 0){
1234 namesAccnos2.insert(name);
1235 }else { //you are in both so erase
1236 namesAccnos.erase(name);
1237 namesDups.insert(name);
1239 }else { //you are in both so erase
1240 namesAccnos.erase(name);
1241 namesDups.insert(name);
1248 out << "Names in both files : " + toString(namesDups.size()) << endl;
1249 m->mothurOut("Names in both files : " + toString(namesDups.size())); m->mothurOutEndLine();
1251 for (set<string>::iterator it = namesDups.begin(); it != namesDups.end(); it++) {
1253 if (namefile != "") { out << '\t' << nameCount[(*it)]; }
1257 out << "Names unique to " + accnosfile + " : " + toString(namesAccnos.size()) << endl;
1258 m->mothurOut("Names unique to " + accnosfile + " : " + toString(namesAccnos.size())); m->mothurOutEndLine();
1260 for (set<string>::iterator it = namesAccnos.begin(); it != namesAccnos.end(); it++) {
1262 if (namefile != "") { out << '\t' << nameCount[(*it)]; }
1266 out << "Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size()) << endl;
1267 m->mothurOut("Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size())); m->mothurOutEndLine();
1269 for (set<string>::iterator it = namesAccnos2.begin(); it != namesAccnos2.end(); it++) {
1271 if (namefile != "") { out << '\t' << nameCount[(*it)]; }
1277 outputNames.push_back(outputFileName); outputTypes["accnosreport"].push_back(outputFileName);
1282 catch(exception& e) {
1283 m->errorOut(e, "GetSeqsCommand", "compareAccnos");
1289 //**********************************************************************************************************************