]> git.donarmstrong.com Git - mothur.git/blob - getlineagecommand.cpp
classify.seqs allows sequences to be in taxonomy file that are not in template. ...
[mothur.git] / getlineagecommand.cpp
1 /*
2  *  getlineagecommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/24/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "getlineagecommand.h"
11 #include "sequence.hpp"
12 #include "listvector.hpp"
13 #include "counttable.h"
14
15 //**********************************************************************************************************************
16 vector<string> GetLineageCommand::setParameters(){      
17         try {
18                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
19         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname);
20         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount);
21                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup);
22                 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist);
23                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,true); parameters.push_back(ptaxonomy);
24                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport);
25                 CommandParameter ptaxon("taxon", "String", "", "", "", "", "",false,true); parameters.push_back(ptaxon);
26                 CommandParameter pdups("dups", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pdups);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "GetLineageCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string GetLineageCommand::getHelpString(){      
41         try {
42                 string helpString = "";
43                 helpString += "The get.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, count, list or alignreport file.\n";
44                 helpString += "It outputs a file containing only the sequences from the taxonomy file that are from the taxon requested.\n";
45                 helpString += "The get.lineage command parameters are taxon, fasta, name, group, count, list, taxonomy, alignreport and dups.  You must provide taxonomy unless you have a valid current taxonomy file.\n";
46                 helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n";
47                 helpString += "The taxon parameter allows you to select the taxons you would like to get and is required.\n";
48                 helpString += "You may enter your taxons with confidence scores, doing so will get only those sequences that belong to the taxonomy and whose cofidence scores is above the scores you give.\n";
49                 helpString += "If they belong to the taxonomy and have confidences below those you provide the sequence will not be selected.\n";
50                 helpString += "The get.lineage command should be in the following format: get.lineage(taxonomy=yourTaxonomyFile, taxon=yourTaxons).\n";
51                 helpString += "Example get.lineage(taxonomy=amazon.silva.taxonomy, taxon=Bacteria;Firmicutes;Bacilli;Lactobacillales;).\n";
52                 helpString += "Note: If you are running mothur in script mode you must wrap the taxon in ' characters so mothur will ignore the ; in the taxon.\n";
53                 helpString += "Example get.lineage(taxonomy=amazon.silva.taxonomy, taxon='Bacteria;Firmicutes;Bacilli;Lactobacillales;').\n";
54                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
55                 return helpString;
56         }
57         catch(exception& e) {
58                 m->errorOut(e, "GetLineageCommand", "getHelpString");
59                 exit(1);
60         }
61 }
62 //**********************************************************************************************************************
63 string GetLineageCommand::getOutputFileNameTag(string type, string inputName=""){       
64         try {
65         string outputFileName = "";
66                 map<string, vector<string> >::iterator it;
67         
68         //is this a type this command creates
69         it = outputTypes.find(type);
70         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
71         else {
72             if (type == "fasta")            {   outputFileName =  "pick" + m->getExtension(inputName);   }
73             else if (type == "taxonomy")    {   outputFileName =  "pick" + m->getExtension(inputName);   }
74             else if (type == "name")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
75             else if (type == "count")       {   outputFileName =  "pick.count_table";                    }  
76             else if (type == "group")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
77             else if (type == "list")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
78             else if (type == "alignreport") {   outputFileName =  "pick.align.report";   }
79             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
80         }
81         return outputFileName;
82         }
83         catch(exception& e) {
84                 m->errorOut(e, "GetLineageCommand", "getOutputFileNameTag");
85                 exit(1);
86         }
87 }
88 //**********************************************************************************************************************
89 GetLineageCommand::GetLineageCommand(){ 
90         try {
91                 abort = true; calledHelp = true; 
92                 setParameters();
93                 vector<string> tempOutNames;
94                 outputTypes["fasta"] = tempOutNames;
95                 outputTypes["taxonomy"] = tempOutNames;
96                 outputTypes["name"] = tempOutNames;
97                 outputTypes["group"] = tempOutNames;
98                 outputTypes["alignreport"] = tempOutNames;
99                 outputTypes["list"] = tempOutNames;
100         outputTypes["count"] = tempOutNames;
101         }
102         catch(exception& e) {
103                 m->errorOut(e, "GetLineageCommand", "GetLineageCommand");
104                 exit(1);
105         }
106 }
107 //**********************************************************************************************************************
108 GetLineageCommand::GetLineageCommand(string option)  {
109         try {
110                 abort = false; calledHelp = false;   
111                                 
112                 //allow user to run help
113                 if(option == "help") { help(); abort = true; calledHelp = true; }
114                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
115                 
116                 else {
117                         vector<string> myArray = setParameters();
118                         
119                         OptionParser parser(option);
120                         map<string,string> parameters = parser.getParameters();
121                         
122                         ValidParameters validParameter;
123                         map<string,string>::iterator it;
124                         
125                         //check to make sure all parameters are valid for command
126                         for (it = parameters.begin(); it != parameters.end(); it++) { 
127                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
128                         }
129                         
130                         //initialize outputTypes
131                         vector<string> tempOutNames;
132                         outputTypes["fasta"] = tempOutNames;
133                         outputTypes["taxonomy"] = tempOutNames;
134                         outputTypes["name"] = tempOutNames;
135                         outputTypes["group"] = tempOutNames;
136                         outputTypes["alignreport"] = tempOutNames;
137                         outputTypes["list"] = tempOutNames;
138             outputTypes["count"] = tempOutNames;
139
140                         //if the user changes the output directory command factory will send this info to us in the output parameter 
141                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
142                         
143                         //if the user changes the input directory command factory will send this info to us in the output parameter 
144                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
145                         if (inputDir == "not found"){   inputDir = "";          }
146                         else {
147                                 string path;
148                                 it = parameters.find("alignreport");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
154                                 }
155                                 
156                                 it = parameters.find("fasta");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
162                                 }
163                                 
164                                 it = parameters.find("list");
165                                 //user has given a template file
166                                 if(it != parameters.end()){ 
167                                         path = m->hasPath(it->second);
168                                         //if the user has not given a path then, add inputdir. else leave path alone.
169                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
170                                 }
171                                 
172                                 it = parameters.find("name");
173                                 //user has given a template file
174                                 if(it != parameters.end()){ 
175                                         path = m->hasPath(it->second);
176                                         //if the user has not given a path then, add inputdir. else leave path alone.
177                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
178                                 }
179                                 
180                                 it = parameters.find("group");
181                                 //user has given a template file
182                                 if(it != parameters.end()){ 
183                                         path = m->hasPath(it->second);
184                                         //if the user has not given a path then, add inputdir. else leave path alone.
185                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
186                                 }
187                                 
188                                 it = parameters.find("taxonomy");
189                                 //user has given a template file
190                                 if(it != parameters.end()){ 
191                                         path = m->hasPath(it->second);
192                                         //if the user has not given a path then, add inputdir. else leave path alone.
193                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
194                                 }
195                 
196                 it = parameters.find("count");
197                                 //user has given a template file
198                                 if(it != parameters.end()){ 
199                                         path = m->hasPath(it->second);
200                                         //if the user has not given a path then, add inputdir. else leave path alone.
201                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
202                                 }
203                         }
204
205                         
206                         //check for required parameters                 
207                         fastafile = validParameter.validFile(parameters, "fasta", true);
208                         if (fastafile == "not open") { fastafile = ""; abort = true; }
209                         else if (fastafile == "not found") {  fastafile = "";  }
210                         else { m->setFastaFile(fastafile); }
211                         
212                         namefile = validParameter.validFile(parameters, "name", true);
213                         if (namefile == "not open") { namefile = ""; abort = true; }
214                         else if (namefile == "not found") {  namefile = "";  }  
215                         else { m->setNameFile(namefile); }
216                         
217                         groupfile = validParameter.validFile(parameters, "group", true);
218                         if (groupfile == "not open") { abort = true; }
219                         else if (groupfile == "not found") {  groupfile = "";  }        
220                         else { m->setGroupFile(groupfile); }
221                         
222                         alignfile = validParameter.validFile(parameters, "alignreport", true);
223                         if (alignfile == "not open") { abort = true; }
224                         else if (alignfile == "not found") {  alignfile = "";  }
225                         
226                         listfile = validParameter.validFile(parameters, "list", true);
227                         if (listfile == "not open") { abort = true; }
228                         else if (listfile == "not found") {  listfile = "";  }
229                         else { m->setListFile(listfile); }
230                         
231                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
232                         if (taxfile == "not open") { taxfile = ""; abort = true; }
233                         else if (taxfile == "not found") {                              
234                                 taxfile = m->getTaxonomyFile(); 
235                                 if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
236                                 else {  m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
237                         }else { m->setTaxonomyFile(taxfile); }
238                         
239                         string usedDups = "true";
240                         string temp = validParameter.validFile(parameters, "dups", false);      
241                         if (temp == "not found") { 
242                                 if (namefile != "") {  temp = "true";                                   }
243                                 else                            {  temp = "false"; usedDups = "";       }
244                         }
245                         dups = m->isTrue(temp);
246             
247             countfile = validParameter.validFile(parameters, "count", true);
248             if (countfile == "not open") { countfile = ""; abort = true; }
249             else if (countfile == "not found") { countfile = "";  }     
250             else { m->setCountTableFile(countfile); }
251             
252             if ((namefile != "") && (countfile != "")) {
253                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
254             }
255             
256             if ((groupfile != "") && (countfile != "")) {
257                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
258             }
259                         
260                         taxons = validParameter.validFile(parameters, "taxon", false);  
261                         if (taxons == "not found") { taxons = "";  m->mothurOut("No taxons given, please correct."); m->mothurOutEndLine();  abort = true;  }
262                         else { 
263                                 //rip off quotes
264                                 if (taxons[0] == '\'') {  taxons = taxons.substr(1); }
265                                 if (taxons[(taxons.length()-1)] == '\'') {  taxons = taxons.substr(0, (taxons.length()-1)); }
266                         }
267                         m->splitAtChar(taxons, listOfTaxons, '-');
268                         
269                         if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (countfile == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; }
270                 
271             if (countfile == "") {
272                 if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
273                     vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
274                     parser.getNameFile(files);
275                 }
276             }
277                 }
278
279         }
280         catch(exception& e) {
281                 m->errorOut(e, "GetLineageCommand", "GetLineageCommand");
282                 exit(1);
283         }
284 }
285 //**********************************************************************************************************************
286
287 int GetLineageCommand::execute(){
288         try {
289                 
290                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
291                 
292                 if (m->control_pressed) { return 0; }
293         
294         if (countfile != "") {
295             if ((fastafile != "") || (listfile != "") || (taxfile != "")) { 
296                 m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n");
297             }
298         }
299                 
300                 //read through the correct file and output lines you want to keep
301                 if (taxfile != "")                      {               readTax();              }  //fills the set of names to get
302                 if (namefile != "")                     {               readName();             }
303                 if (fastafile != "")            {               readFasta();    }
304         if (countfile != "")            {               readCount();    }
305                 if (groupfile != "")            {               readGroup();    }
306                 if (alignfile != "")            {               readAlign();    }
307                 if (listfile != "")                     {               readList();             }
308                 
309                 
310                 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
311                 
312                 if (outputNames.size() != 0) {
313                         m->mothurOutEndLine();
314                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
315                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
316                         m->mothurOutEndLine();
317                         
318                         //set fasta file as new current fastafile
319                         string current = "";
320                         itTypes = outputTypes.find("fasta");
321                         if (itTypes != outputTypes.end()) {
322                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
323                         }
324                         
325                         itTypes = outputTypes.find("name");
326                         if (itTypes != outputTypes.end()) {
327                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
328                         }
329                         
330                         itTypes = outputTypes.find("group");
331                         if (itTypes != outputTypes.end()) {
332                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
333                         }
334                         
335                         itTypes = outputTypes.find("list");
336                         if (itTypes != outputTypes.end()) {
337                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
338                         }
339                         
340                         itTypes = outputTypes.find("taxonomy");
341                         if (itTypes != outputTypes.end()) {
342                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
343                         }
344                         
345             itTypes = outputTypes.find("count");
346                         if (itTypes != outputTypes.end()) {
347                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
348                         }
349                 }
350                 
351                 return 0;               
352         }
353
354         catch(exception& e) {
355                 m->errorOut(e, "GetLineageCommand", "execute");
356                 exit(1);
357         }
358 }
359
360 //**********************************************************************************************************************
361 int GetLineageCommand::readFasta(){
362         try {
363                 string thisOutputDir = outputDir;
364                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
365                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta", fastafile);
366                 ofstream out;
367                 m->openOutputFile(outputFileName, out);
368                 
369                 
370                 ifstream in;
371                 m->openInputFile(fastafile, in);
372                 string name;
373                 
374                 bool wroteSomething = false;
375                 
376                 while(!in.eof()){
377                 
378                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName);  return 0; }
379                         
380                         Sequence currSeq(in);
381                         name = currSeq.getName();
382                         
383                         if (name != "") {
384                                 //if this name is in the accnos file
385                                 if (names.count(name) != 0) {
386                                         wroteSomething = true;
387                                         
388                                         currSeq.printSequence(out);
389                                 }
390                         }
391                         m->gobble(in);
392                 }
393                 in.close();     
394                 out.close();
395                 
396                 if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
397                 outputNames.push_back(outputFileName);  outputTypes["fasta"].push_back(outputFileName);
398                 
399                 return 0;
400
401         }
402         catch(exception& e) {
403                 m->errorOut(e, "GetLineageCommand", "readFasta");
404                 exit(1);
405         }
406 }
407 //**********************************************************************************************************************
408 int GetLineageCommand::readCount(){
409         try {
410                 string thisOutputDir = outputDir;
411                 if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
412                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
413                 
414                 ofstream out;
415                 m->openOutputFile(outputFileName, out);
416                 
417                 ifstream in;
418                 m->openInputFile(countfile, in);
419                 
420                 bool wroteSomething = false;
421                 
422         string headers = m->getline(in); m->gobble(in);
423         out << headers << endl;
424         
425         string name, rest; int thisTotal;
426         while (!in.eof()) {
427             
428             if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
429             
430             in >> name; m->gobble(in); 
431             in >> thisTotal; m->gobble(in);
432             rest = m->getline(in); m->gobble(in);
433             if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); }
434             
435             if (names.count(name) != 0) {
436                 out << name << '\t' << thisTotal << '\t' << rest << endl;
437                 wroteSomething = true;
438             }
439         }
440         in.close();
441                 out.close();
442         
443         //check for groups that have been eliminated
444         CountTable ct;
445         if (ct.testGroups(outputFileName)) {
446             ct.readTable(outputFileName);
447             ct.printTable(outputFileName);
448         }
449
450                 
451                 if (wroteSomething == false) {  m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
452                 outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
453                        
454                 return 0;
455         }
456         catch(exception& e) {
457                 m->errorOut(e, "GetLineageCommand", "readCount");
458                 exit(1);
459         }
460 }
461 //**********************************************************************************************************************
462 int GetLineageCommand::readList(){
463         try {
464                 string thisOutputDir = outputDir;
465                 if (outputDir == "") {  thisOutputDir += m->hasPath(listfile);  }
466                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + getOutputFileNameTag("list", listfile);
467                 ofstream out;
468                 m->openOutputFile(outputFileName, out);
469                 
470                 ifstream in;
471                 m->openInputFile(listfile, in);
472                 
473                 bool wroteSomething = false;
474                 
475                 while(!in.eof()){
476                         
477                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName);  return 0; }
478
479                         //read in list vector
480                         ListVector list(in);
481                         
482                         //make a new list vector
483                         ListVector newList;
484                         newList.setLabel(list.getLabel());
485                         
486                         //for each bin
487                         for (int i = 0; i < list.getNumBins(); i++) {
488                         
489                                 //parse out names that are in accnos file
490                                 string binnames = list.get(i);
491                                 
492                                 string newNames = "";
493                                 while (binnames.find_first_of(',') != -1) { 
494                                         string name = binnames.substr(0,binnames.find_first_of(','));
495                                         binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length());
496                                         
497                                         //if that name is in the .accnos file, add it
498                                         if (names.count(name) != 0) {  newNames += name + ",";  }
499                                 }
500                         
501                                 //get last name
502                                 if (names.count(binnames) != 0) {  newNames += binnames + ",";  }
503
504                                 //if there are names in this bin add to new list
505                                 if (newNames != "") { 
506                                         newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma
507                                         newList.push_back(newNames);    
508                                 }
509                         }
510                                 
511                         //print new listvector
512                         if (newList.getNumBins() != 0) {
513                                 wroteSomething = true;
514                                 newList.print(out);
515                         }
516                         
517                         m->gobble(in);
518                 }
519                 in.close();     
520                 out.close();
521                 
522                 if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
523                 outputNames.push_back(outputFileName); outputTypes["list"].push_back(outputFileName);
524                 
525                 return 0;
526
527         }
528         catch(exception& e) {
529                 m->errorOut(e, "GetLineageCommand", "readList");
530                 exit(1);
531         }
532 }
533 //**********************************************************************************************************************
534 int GetLineageCommand::readName(){
535         try {
536                 string thisOutputDir = outputDir;
537                 if (outputDir == "") {  thisOutputDir += m->hasPath(namefile);  }
538                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile);
539                 ofstream out;
540                 m->openOutputFile(outputFileName, out);
541                 
542
543                 ifstream in;
544                 m->openInputFile(namefile, in);
545                 string name, firstCol, secondCol;
546                 
547                 bool wroteSomething = false;
548                 
549                 
550                 while(!in.eof()){
551                 
552                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName);  return 0; }
553
554                         in >> firstCol;                         
555                         in >> secondCol;
556                         
557                         string hold = "";
558                         if (dups) { hold = secondCol; }
559                         
560                         vector<string> parsedNames;
561                         m->splitAtComma(secondCol, parsedNames);
562                         
563                         vector<string> validSecond;
564                         for (int i = 0; i < parsedNames.size(); i++) {
565                                 if (names.count(parsedNames[i]) != 0) {
566                                         validSecond.push_back(parsedNames[i]);
567                                 }
568                         }
569
570                         if ((dups) && (validSecond.size() != 0)) { //dups = true and we want to add someone, then add everyone
571                                 for (int i = 0; i < parsedNames.size(); i++) {  names.insert(parsedNames[i]);  }
572                                 out << firstCol << '\t' << hold << endl;
573                                 wroteSomething = true;
574                         }else {
575                                 //if the name in the first column is in the set then print it and any other names in second column also in set
576                                 if (names.count(firstCol) != 0) {
577                                 
578                                         wroteSomething = true;
579                                         
580                                         out << firstCol << '\t';
581                                         
582                                         //you know you have at least one valid second since first column is valid
583                                         for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
584                                         out << validSecond[validSecond.size()-1] << endl;
585                                         
586                                 
587                                 //make first name in set you come to first column and then add the remaining names to second column
588                                 }else {
589                                         //you want part of this row
590                                         if (validSecond.size() != 0) {
591                                         
592                                                 wroteSomething = true;
593                                                 
594                                                 out << validSecond[0] << '\t';
595                                         
596                                                 //you know you have at least one valid second since first column is valid
597                                                 for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
598                                                 out << validSecond[validSecond.size()-1] << endl;
599                                         }
600                                 }
601                         }
602                         m->gobble(in);
603                 }
604                 in.close();
605                 out.close();
606                 
607                 if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
608                 outputNames.push_back(outputFileName);  outputTypes["name"].push_back(outputFileName);
609                 
610                 return 0;
611                 
612         }
613         catch(exception& e) {
614                 m->errorOut(e, "GetLineageCommand", "readName");
615                 exit(1);
616         }
617 }
618
619 //**********************************************************************************************************************
620 int GetLineageCommand::readGroup(){
621         try {
622                 string thisOutputDir = outputDir;
623                 if (outputDir == "") {  thisOutputDir += m->hasPath(groupfile);  }
624                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
625                 ofstream out;
626                 m->openOutputFile(outputFileName, out);
627                 
628
629                 ifstream in;
630                 m->openInputFile(groupfile, in);
631                 string name, group;
632                 
633                 bool wroteSomething = false;
634                 
635                 while(!in.eof()){
636
637                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName);  return 0; }
638
639
640                         in >> name;                             //read from first column
641                         in >> group;                    //read from second column
642                         
643                         //if this name is in the accnos file
644                         if (names.count(name) != 0) {
645                                 wroteSomething = true;
646                                 
647                                 out << name << '\t' << group << endl;
648                         }
649                                         
650                         m->gobble(in);
651                 }
652                 in.close();
653                 out.close();
654                 
655                 if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
656                 outputNames.push_back(outputFileName);  outputTypes["group"].push_back(outputFileName);
657                 
658                 return 0;
659
660         }
661         catch(exception& e) {
662                 m->errorOut(e, "GetLineageCommand", "readGroup");
663                 exit(1);
664         }
665 }
666 //**********************************************************************************************************************
667 int GetLineageCommand::readTax(){
668         try {
669                 string thisOutputDir = outputDir;
670                 if (outputDir == "") {  thisOutputDir += m->hasPath(taxfile);  }
671                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + getOutputFileNameTag("taxonomy", taxfile);
672                 ofstream out;
673                 m->openOutputFile(outputFileName, out);
674                 
675                 ifstream in;
676                 m->openInputFile(taxfile, in);
677                 string name, tax;
678                 
679                 //bool wroteSomething = false;
680                 vector<bool> taxonsHasConfidence; taxonsHasConfidence.resize(listOfTaxons.size(), false);
681                 vector< vector< map<string, float> > > searchTaxons; searchTaxons.resize(listOfTaxons.size());
682                 vector<string> noConfidenceTaxons; noConfidenceTaxons.resize(listOfTaxons.size(), "");
683                 
684                 for (int i = 0; i < listOfTaxons.size(); i++) {
685                         noConfidenceTaxons[i] = listOfTaxons[i];
686                         int hasConPos = listOfTaxons[i].find_first_of('(');
687                         if (hasConPos != string::npos) {  
688                                 taxonsHasConfidence[i] = true; 
689                                 searchTaxons[i] = getTaxons(listOfTaxons[i]); 
690                                 noConfidenceTaxons[i] = listOfTaxons[i];
691                                 m->removeConfidences(noConfidenceTaxons[i]);
692                         }
693                 }
694                 
695                 
696                 while(!in.eof()){
697
698                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName);  return 0; }
699
700                         in >> name;                             //read from first column
701                         in >> tax;                      //read from second column
702                         
703             string noQuotesTax = m->removeQuotes(tax);
704             
705                         for (int j = 0; j < listOfTaxons.size(); j++) {
706                                                         
707                                 string newtax = noQuotesTax;
708                         
709                                 //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them
710                                 if (!taxonsHasConfidence[j]) {
711                                         int hasConfidences = noQuotesTax.find_first_of('(');
712                                         if (hasConfidences != string::npos) { 
713                                                 newtax = noQuotesTax;
714                                                 m->removeConfidences(newtax);
715                                         }
716                                 
717                                         int pos = newtax.find(noConfidenceTaxons[j]);
718                                 
719                                         if (pos != string::npos) { //this sequence contains the taxon the user wants
720                                                 names.insert(name); 
721                                                 out << name << '\t' << tax << endl;
722                                                 //since you belong to at least one of the taxons we want you are included so no need to search for other
723                                                 break;
724                                         }
725                                 }else{//if listOfTaxons[i] has them and you don't them remove taxons
726                                         int hasConfidences = noQuotesTax.find_first_of('(');
727                                         if (hasConfidences == string::npos) { 
728                                         
729                                                 int pos = newtax.find(noConfidenceTaxons[j]);
730                                         
731                                                 if (pos != string::npos) { //this sequence contains the taxon the user wants
732                                                         names.insert(name);
733                                                         out << name << '\t' << tax << endl;
734                                                         //since you belong to at least one of the taxons we want you are included so no need to search for other
735                                                         break;
736                                                 }
737                                         }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons
738                                                 //first remove confidences from both and see if the taxonomy exists
739                                         
740                                                 string noNewTax = noQuotesTax;
741                                                 int hasConfidences = noQuotesTax.find_first_of('(');
742                                                 if (hasConfidences != string::npos) { 
743                                                         noNewTax = noQuotesTax;
744                                                         m->removeConfidences(noNewTax);
745                                                 }
746                                         
747                                                 int pos = noNewTax.find(noConfidenceTaxons[j]);
748                                         
749                                                 if (pos != string::npos) { //if yes, then are the confidences okay
750                                                 
751                                                         bool good = true;
752                                                         vector< map<string, float> > usersTaxon = getTaxons(newtax);
753                                                 
754                                                         //the usersTaxon is most likely longer than the searchTaxons, and searchTaxon[0] may relate to userTaxon[4]
755                                                         //we want to "line them up", so we will find the the index where the searchstring starts
756                                                         int index = 0;
757                                                         for (int i = 0; i < usersTaxon.size(); i++) {
758                                                         
759                                                                 if (usersTaxon[i].begin()->first == searchTaxons[j][0].begin()->first) { 
760                                                                         index = i;  
761                                                                         int spot = 0;
762                                                                         bool goodspot = true;
763                                                                         //is this really the start, or are we dealing with a taxon of the same name?
764                                                                         while ((spot < searchTaxons[j].size()) && ((i+spot) < usersTaxon.size())) {
765                                                                                 if (usersTaxon[i+spot].begin()->first != searchTaxons[j][spot].begin()->first) { goodspot = false; break; }
766                                                                                 else { spot++; }
767                                                                         }
768                                                                 
769                                                                         if (goodspot) { break; }
770                                                                 }
771                                                         }
772                                                 
773                                                         for (int i = 0; i < searchTaxons[j].size(); i++) {
774                                                         
775                                                                 if ((i+index) < usersTaxon.size()) { //just in case, should never be false
776                                                                         if (usersTaxon[i+index].begin()->second < searchTaxons[j][i].begin()->second) { //is the users cutoff less than the search taxons
777                                                                                 good = false;
778                                                                                 break;
779                                                                         }
780                                                                 }else {
781                                                                         good = false;
782                                                                         break;
783                                                                 }
784                                                         }
785                                                 
786                                                         //passed the test so add you
787                                                         if (good) {
788                                                                 names.insert(name);
789                                                                 out << name << '\t' << tax << endl;
790                                                                 break;
791                                                         }
792                                                 }
793                                         }
794                                 }
795                         
796                         }
797                         
798                         m->gobble(in);
799                 }
800                 in.close();
801                 out.close();
802                 
803                 if (names.size() == 0) { m->mothurOut("Your taxonomy file does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
804                 outputNames.push_back(outputFileName); outputTypes["taxonomy"].push_back(outputFileName);
805                         
806                 return 0;
807
808         }
809         catch(exception& e) {
810                 m->errorOut(e, "GetLineageCommand", "readTax");
811                 exit(1);
812         }
813 }
814 /**************************************************************************************************/
815 vector< map<string, float> > GetLineageCommand::getTaxons(string tax) {
816         try {
817         
818                 vector< map<string, float> > t;
819                 string taxon = "";
820                 int taxLength = tax.length();
821         
822                 for(int i=0;i<taxLength;i++){
823                         if(tax[i] == ';'){
824                 
825                                 int openParen = taxon.find_last_of('(');
826                                 int closeParen = taxon.find_last_of(')');
827                                 
828                                 string newtaxon, confidence;
829                                 if ((openParen != string::npos) && (closeParen != string::npos)) {
830                     string confidenceScore = taxon.substr(openParen+1, (closeParen-(openParen+1)));
831                     if (m->isNumeric1(confidenceScore)) {  //its a confidence
832                         newtaxon = taxon.substr(0, openParen); //rip off confidence
833                         confidence = taxon.substr((openParen+1), (closeParen-openParen-1));  
834                     }else { //its part of the taxon
835                         newtaxon = taxon;
836                         confidence = "0";
837                     }
838                                 }else{
839                                         newtaxon = taxon;
840                                         confidence = "0";
841                                 } 
842                                 float con = 0;
843                                 convert(confidence, con);
844                                 
845                                 map<string, float> temp;
846                                 temp[newtaxon] = con;
847                 
848                                 t.push_back(temp);
849                                 taxon = "";
850                         }
851                         else{
852                                 taxon += tax[i];
853                 
854                         }
855                 }
856                 
857                 return t;
858         }
859         catch(exception& e) {
860                 m->errorOut(e, "GetLineageCommand", "getTaxons");
861                 exit(1);
862         }
863 }
864 //**********************************************************************************************************************
865 //alignreport file has a column header line then all other lines contain 16 columns.  we just want the first column since that contains the name
866 int GetLineageCommand::readAlign(){
867         try {
868                 string thisOutputDir = outputDir;
869                 if (outputDir == "") {  thisOutputDir += m->hasPath(alignfile);  }
870                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(alignfile)) + getOutputFileNameTag("alignreport");
871                 ofstream out;
872                 m->openOutputFile(outputFileName, out);
873                 
874
875                 ifstream in;
876                 m->openInputFile(alignfile, in);
877                 string name, junk;
878                 
879                 bool wroteSomething = false;
880                 
881                 //read column headers
882                 for (int i = 0; i < 16; i++) {  
883                         if (!in.eof())  {       in >> junk;      out << junk << '\t';   }
884                         else                    {       break;                  }
885                 }
886                 out << endl;
887                 
888                 while(!in.eof()){
889                 
890                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName);  return 0; }
891
892
893                         in >> name;                             //read from first column
894                         
895                         //if this name is in the accnos file
896                         if (names.count(name) != 0) {
897                                 wroteSomething = true;
898                                 
899                                 out << name << '\t';
900                                 
901                                 //read rest
902                                 for (int i = 0; i < 15; i++) {  
903                                         if (!in.eof())  {       in >> junk;      out << junk << '\t';   }
904                                         else                    {       break;                  }
905                                 }
906                                 out << endl;
907                                 
908                         }else {//still read just don't do anything with it
909                                 //read rest
910                                 for (int i = 0; i < 15; i++) {  
911                                         if (!in.eof())  {       in >> junk;             }
912                                         else                    {       break;                  }
913                                 }
914                         }
915                         
916                         m->gobble(in);
917                 }
918                 in.close();
919                 out.close();
920                 
921                 if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
922                 outputNames.push_back(outputFileName); outputTypes["alignreport"].push_back(outputFileName);
923                 
924                 return 0;
925                 
926         }
927         catch(exception& e) {
928                 m->errorOut(e, "GetLineageCommand", "readAlign");
929                 exit(1);
930         }
931 }
932 //**********************************************************************************************************************
933
934
935
936