]> git.donarmstrong.com Git - mothur.git/blob - sortseqscommand.cpp
changed sffinfo flow default to true. fixed bug in trim.seqs and filter.seqs related...
[mothur.git] / sortseqscommand.cpp
1 //
2 //  sortseqscommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 2/3/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "sortseqscommand.h"
10 #include "sequence.hpp"
11 #include "qualityscores.h"
12
13 //**********************************************************************************************************************
14 vector<string> SortSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
17         CommandParameter pflow("flow", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pflow);
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup);
20                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy);
21                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pqfile);
22                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
23                 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos);
24         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "SortSeqsCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string SortSeqsCommand::getHelpString(){        
38         try {
39                 string helpString = "";
40                 helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos fasta, name, group, taxonomy, flow or quality file.\n";
41         helpString += "The sort.seqs command parameters are accnos, fasta, name, group, taxonomy, flow, qfile and large.\n";
42         helpString += "The accnos file allows you to specify the order you want the files in.  If none is provided, mothur will use the order of the first file it reads.\n";
43         helpString += "The large parameters is used to indicate your files are too large to fit in RAM.\n";
44                 helpString += "The sort.seqs command should be in the following format: sort.seqs(fasta=yourFasta).\n";
45                 helpString += "Example sort.seqs(fasta=amazon.fasta).\n";
46                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
47                 return helpString;
48         }
49         catch(exception& e) {
50                 m->errorOut(e, "SortSeqsCommand", "getHelpString");
51                 exit(1);
52         }
53 }
54
55
56 //**********************************************************************************************************************
57 SortSeqsCommand::SortSeqsCommand(){     
58         try {
59                 abort = true; calledHelp = true; 
60                 setParameters();
61                 vector<string> tempOutNames;
62                 outputTypes["fasta"] = tempOutNames;
63                 outputTypes["taxonomy"] = tempOutNames;
64                 outputTypes["name"] = tempOutNames;
65                 outputTypes["group"] = tempOutNames;
66                 outputTypes["qfile"] = tempOutNames;
67         outputTypes["flow"] = tempOutNames;
68         }
69         catch(exception& e) {
70                 m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
71                 exit(1);
72         }
73 }
74 //**********************************************************************************************************************
75 SortSeqsCommand::SortSeqsCommand(string option)  {
76         try {
77                 abort = false; calledHelp = false;   
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
82                 
83                 else {
84                         vector<string> myArray = setParameters();
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter;
90                         map<string,string>::iterator it;
91                         
92                         //check to make sure all parameters are valid for command
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99                         outputTypes["fasta"] = tempOutNames;
100                         outputTypes["taxonomy"] = tempOutNames;
101                         outputTypes["name"] = tempOutNames;
102                         outputTypes["group"] = tempOutNames;
103                         outputTypes["qfile"] = tempOutNames;
104             outputTypes["flow"] = tempOutNames;
105                         
106                         //if the user changes the output directory command factory will send this info to us in the output parameter 
107                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
108                         
109                         //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it = parameters.find("fasta");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
120                                 }
121                                 
122                                 it = parameters.find("name");
123                                 //user has given a template file
124                                 if(it != parameters.end()){ 
125                                         path = m->hasPath(it->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
128                                 }
129                                 
130                                 it = parameters.find("group");
131                                 //user has given a template file
132                                 if(it != parameters.end()){ 
133                                         path = m->hasPath(it->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
136                                 }
137                                 
138                                 it = parameters.find("taxonomy");
139                                 //user has given a template file
140                                 if(it != parameters.end()){ 
141                                         path = m->hasPath(it->second);
142                                         //if the user has not given a path then, add inputdir. else leave path alone.
143                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
144                                 }
145                                 
146                                 it = parameters.find("qfile");
147                                 //user has given a template file
148                                 if(it != parameters.end()){ 
149                                         path = m->hasPath(it->second);
150                                         //if the user has not given a path then, add inputdir. else leave path alone.
151                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
152                                 }
153                 
154                 it = parameters.find("accnos");
155                                 //user has given a template file
156                                 if(it != parameters.end()){ 
157                                         path = m->hasPath(it->second);
158                                         //if the user has not given a path then, add inputdir. else leave path alone.
159                                         if (path == "") {       parameters["accnos"] = inputDir + it->second;           }
160                                 }
161                 
162                 it = parameters.find("flow");
163                                 //user has given a template file
164                                 if(it != parameters.end()){ 
165                                         path = m->hasPath(it->second);
166                                         //if the user has not given a path then, add inputdir. else leave path alone.
167                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
168                                 }
169                         }
170             
171                         
172                         //check for parameters
173             accnosfile = validParameter.validFile(parameters, "accnos", true);
174                         if (accnosfile == "not open") { accnosfile = ""; abort = true; }
175                         else if (accnosfile == "not found") {  accnosfile = "";  }      
176                         else { m->setAccnosFile(accnosfile); }
177             
178                         fastafile = validParameter.validFile(parameters, "fasta", true);
179                         if (fastafile == "not open") { fastafile = ""; abort = true; }
180                         else if (fastafile == "not found") {  fastafile = "";  }        
181                         else { m->setFastaFile(fastafile); }
182             
183             flowfile = validParameter.validFile(parameters, "flow", true);
184                         if (flowfile == "not open") { flowfile = ""; abort = true; }
185                         else if (flowfile == "not found") {  flowfile = "";  }  
186                         else { m->setFlowFile(flowfile); }
187             
188                         namefile = validParameter.validFile(parameters, "name", true);
189                         if (namefile == "not open") { namefile = ""; abort = true; }
190                         else if (namefile == "not found") {  namefile = "";  }  
191                         else { m->setNameFile(namefile); } 
192             
193                         groupfile = validParameter.validFile(parameters, "group", true);
194                         if (groupfile == "not open") { abort = true; }
195                         else if (groupfile == "not found") {  groupfile = "";  }
196                         else { m->setGroupFile(groupfile); }
197                         
198                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
199                         if (taxfile == "not open") { abort = true; }
200                         else if (taxfile == "not found") {  taxfile = "";  }
201                         else { m->setTaxonomyFile(taxfile); }
202                         
203                         qualfile = validParameter.validFile(parameters, "qfile", true);
204                         if (qualfile == "not open") { abort = true; }
205                         else if (qualfile == "not found") {  qualfile = "";  }                  
206                         else { m->setQualFile(qualfile); }
207                         
208             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") { temp = "f"; }
209                         large = m->isTrue(temp);
210             
211                         if ((fastafile == "") && (namefile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; }
212                         
213                         if ((fastafile != "") && (namefile == "")) {
214                                 vector<string> files; files.push_back(fastafile);
215                                 parser.getNameFile(files);
216                         }
217                 }
218         
219         }
220         catch(exception& e) {
221                 m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
222                 exit(1);
223         }
224 }
225 //**********************************************************************************************************************
226
227 int SortSeqsCommand::execute(){
228         try {
229                 
230                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
231                 
232                 //read through the correct file and output lines you want to keep
233         if (accnosfile != "")           {               readAccnos();   }
234                 if (fastafile != "")            {               readFasta();    }
235         if (flowfile != "")         {           readFlow();     }
236         if (qualfile != "")                     {               readQual();             }
237         if (namefile != "")                     {               readName();             }
238                 if (groupfile != "")            {               readGroup();    }
239         if (taxfile != "")                      {               readTax();              }
240                 
241                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
242         
243                 if (outputNames.size() != 0) {
244                         m->mothurOutEndLine();
245                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
246                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
247                         m->mothurOutEndLine();
248                         
249                         //set fasta file as new current fastafile
250                         string current = "";
251                         itTypes = outputTypes.find("fasta");
252                         if (itTypes != outputTypes.end()) {
253                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
254                         }
255                         
256                         itTypes = outputTypes.find("name");
257                         if (itTypes != outputTypes.end()) {
258                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
259                         }
260                         
261                         itTypes = outputTypes.find("group");
262                         if (itTypes != outputTypes.end()) {
263                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
264                         }
265                         
266                         
267                         itTypes = outputTypes.find("taxonomy");
268                         if (itTypes != outputTypes.end()) {
269                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
270                         }
271                         
272                         itTypes = outputTypes.find("qfile");
273                         if (itTypes != outputTypes.end()) {
274                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
275                         }       
276             
277             itTypes = outputTypes.find("flow");
278                         if (itTypes != outputTypes.end()) {
279                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFlowFile(current); }
280                         }       
281                 }
282                 
283                 return 0;               
284         }
285     
286         catch(exception& e) {
287                 m->errorOut(e, "SortSeqsCommand", "execute");
288                 exit(1);
289         }
290 }
291
292 //**********************************************************************************************************************
293 int SortSeqsCommand::readFasta(){
294         try {
295                 string thisOutputDir = outputDir;
296                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
297                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "sorted" + m->getExtension(fastafile);
298                 outputTypes["fasta"].push_back(outputFileName);  outputNames.push_back(outputFileName);
299         
300                 ofstream out;
301                 m->openOutputFile(outputFileName, out);
302                 
303                 ifstream in;
304                 m->openInputFile(fastafile, in);
305                 string name;
306                 
307         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
308             
309             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
310                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
311                 //this way we only store 1000 seqs in memory at a time.
312                 
313                 int numNames = names.size();
314                 int numNamesInFile = 0;
315                 
316                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
317                 while(!in.eof()){
318                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
319                     
320                     Sequence currSeq(in);
321                     name = currSeq.getName();
322                     
323                     if (name != "") {
324                         numNamesInFile++;
325                         map<string, int>::iterator it = names.find(name);
326                         if (it == names.end()) { 
327                             names[name] = numNames; numNames++;
328                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
329                         }
330                     }
331                     m->gobble(in);
332                 }
333                 in.close();
334                 out.close();
335                 
336                 int numLeft = names.size();
337                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
338                 
339                 int size = 1000; //assume that user can hold 1000 seqs in memory
340                 if (numLeft < size) { size = numLeft; }
341                 int times = 0;
342                 
343                 vector<Sequence> seqs; seqs.resize(size);
344                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
345                 
346                 while (numLeft > 0) {
347                     
348                     ifstream in2;
349                     m->openInputFile(fastafile, in2);
350                     
351                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
352                     
353                     int found = 0;
354                     int needToFind = size;
355                     if (numLeft < size) { needToFind = numLeft; }
356                     
357                     while(!in2.eof()){
358                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
359                         
360                         //stop reading if we already found the seqs we are looking for
361                         if (found >= needToFind) { break; }
362                         
363                         Sequence currSeq(in2);
364                         name = currSeq.getName();
365                         
366                         if (name != "") {
367                             map<string, int>::iterator it = names.find(name);
368                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
369                                 //is it in the set of seqs we are looking for this time around
370                                 int thisSeqsPlace = it->second;
371                                 thisSeqsPlace -= (times * size);
372                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
373                                     seqs[thisSeqsPlace] = currSeq; 
374                                     found++;
375                                 }
376                             }else { m->mothurOut("[ERROR]: in logic of readFasta function.\n"); m->control_pressed = true; }
377                         }
378                         m->gobble(in2);
379                     }
380                     in2.close();        
381
382                     ofstream out2;
383                     m->openOutputFileAppend(outputFileName, out2);
384                     
385                     int output = seqs.size();
386                     if (numLeft < seqs.size()) { output = numLeft; }
387                         
388                     for (int i = 0; i < output; i++) {
389                         if (seqs[i].getName() != "") { seqs[i].printSequence(out2); }
390                     }
391                     out2.close();
392                     
393                     times++;
394                     numLeft -= output;
395                 }
396                 
397                 m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + fastafile + ".\n");
398             }else {
399                 
400                 vector<Sequence> seqs; seqs.resize(names.size());
401                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
402                 
403                 while(!in.eof()){
404                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
405                     
406                     Sequence currSeq(in);
407                     name = currSeq.getName();
408                     
409                     if (name != "") {
410                         map<string, int>::iterator it = names.find(name);
411                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
412                             seqs[it->second] = currSeq;  
413                         }else { //if we cant find it then add it to the end
414                             names[name] = seqs.size();
415                             seqs.push_back(currSeq);
416                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
417                         }
418                     }
419                     m->gobble(in);
420                 }
421                 in.close();     
422                 
423                 int count = 0;
424                 for (int i = 0; i < seqs.size(); i++) {
425                     if (seqs[i].getName() != "") {
426                         seqs[i].printSequence(out); count++;
427                     }
428                 }
429                 out.close();
430                 
431                 m->mothurOut("Ordered " + toString(count) + " sequences from " + fastafile + ".\n");
432             }
433                         
434         }else { //read in file to fill names
435             int count = 0;
436             
437             while(!in.eof()){
438                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
439                 
440                 Sequence currSeq(in);
441                 name = currSeq.getName();
442                 
443                 if (name != "") {
444                     //if this name is in the accnos file
445                     names[name] = count;
446                     count++;
447                     currSeq.printSequence(out);
448                 }
449                 m->gobble(in);
450             }
451             in.close(); 
452             out.close();
453             
454             m->mothurOut("\nUsing " + fastafile + " to determine the order. It contains " + toString(count) + " sequences.\n");
455         }
456                                 
457                 return 0;
458                 
459         }
460         catch(exception& e) {
461                 m->errorOut(e, "SortSeqsCommand", "readFasta");
462                 exit(1);
463         }
464 }
465 //**********************************************************************************************************************
466 int SortSeqsCommand::readFlow(){
467         try {
468                 string thisOutputDir = outputDir;
469                 if (outputDir == "") {  thisOutputDir += m->hasPath(flowfile);  }
470                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowfile)) + "sorted" + m->getExtension(flowfile);
471                 outputTypes["flow"].push_back(outputFileName);  outputNames.push_back(outputFileName);
472         
473                 ofstream out;
474                 m->openOutputFile(outputFileName, out);
475                 
476                 ifstream in;
477                 m->openInputFile(flowfile, in);
478         int numFlows;
479                 string name;
480         
481         in >> numFlows; m->gobble(in);
482                 
483         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
484             
485             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
486                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
487                 //this way we only store 1000 seqs in memory at a time.
488                 
489                 int numNames = names.size();
490                 int numNamesInFile = 0;
491                 
492                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
493                 while(!in.eof()){
494                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
495                     
496                     in >> name; 
497                     string rest = m->getline(in);
498                     
499                     if (name != "") {
500                         numNamesInFile++;
501                         map<string, int>::iterator it = names.find(name);
502                         if (it == names.end()) { 
503                             names[name] = numNames; numNames++;
504                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
505                         }
506                     }
507                     m->gobble(in);
508                 }
509                 in.close();
510                 out.close();
511                 
512                 int numLeft = names.size();
513                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
514                 
515                 int size = 1000; //assume that user can hold 1000 seqs in memory
516                 if (numLeft < size) { size = numLeft; }
517                 int times = 0;
518                 
519                 vector<string> seqs; seqs.resize(size, "");
520                 
521                 while (numLeft > 0) {
522                     
523                     ifstream in2;
524                     m->openInputFile(flowfile, in2); in2 >> numFlows; m->gobble(in2);
525                     
526                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
527                     
528                     int found = 0;
529                     int needToFind = size;
530                     if (numLeft < size) { needToFind = numLeft; }
531                     
532                     while(!in2.eof()){
533                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
534                         
535                         //stop reading if we already found the seqs we are looking for
536                         if (found >= needToFind) { break; }
537                         
538                         in2 >> name;    
539                         string rest = m->getline(in2);
540                         
541                         if (name != "") {
542                             map<string, int>::iterator it = names.find(name);
543                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
544                                 //is it in the set of seqs we are looking for this time around
545                                 int thisSeqsPlace = it->second;
546                                 thisSeqsPlace -= (times * size);
547                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
548                                     seqs[thisSeqsPlace] = (name +'\t' + rest); 
549                                     found++;
550                                 }
551                             }else { m->mothurOut("[ERROR]: in logic of readFlow function.\n"); m->control_pressed = true; }
552                         }
553                         m->gobble(in2);
554                     }
555                     in2.close();        
556                     
557                     ofstream out2;
558                     m->openOutputFileAppend(outputFileName, out2);
559                     
560                     int output = seqs.size();
561                     if (numLeft < seqs.size()) { output = numLeft; }
562                     
563                     for (int i = 0; i < output; i++) {
564                         if (seqs[i] != "") {
565                             out2 << seqs[i] << endl;
566                         }
567                     }
568                     out2.close();
569                     
570                     times++;
571                     numLeft -= output;
572                 }
573                 
574                 m->mothurOut("Ordered " + toString(numNamesInFile) + " flows from " + flowfile + ".\n");
575             }else {
576                 
577                 vector<string> seqs; seqs.resize(names.size(), "");
578                 
579                 while(!in.eof()){
580                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
581                     
582                     in >> name; 
583                     string rest = m->getline(in);
584                     
585                     if (name != "") {
586                         map<string, int>::iterator it = names.find(name);
587                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
588                             seqs[it->second] = (name + '\t' + rest);  
589                         }else { //if we cant find it then add it to the end
590                             names[name] = seqs.size();
591                             seqs.push_back((name + '\t' + rest));
592                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
593                         }
594                     }
595                     m->gobble(in);
596                 }
597                 in.close();     
598                 
599                 int count = 0;
600                 for (int i = 0; i < seqs.size(); i++) {
601                     if (seqs[i] != "") {
602                         out << seqs[i] << endl;
603                         count++;
604                     }
605                 }
606                 out.close();
607                 
608                 m->mothurOut("Ordered " + toString(count) + " flows from " + flowfile + ".\n");
609             }
610             
611         }else { //read in file to fill names
612             int count = 0;
613             
614             while(!in.eof()){
615                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
616                 
617                 in >> name;     
618                 string rest = m->getline(in);
619                 
620                 if (name != "") {
621                     //if this name is in the accnos file
622                     names[name] = count;
623                     count++;
624                     out << name << '\t' << rest << endl;
625                 }
626                 m->gobble(in);
627             }
628             in.close(); 
629             out.close();
630             
631             m->mothurOut("\nUsing " + flowfile + " to determine the order. It contains " + toString(count) + " flows.\n");
632         }
633         
634                 return 0;
635                 
636         }
637         catch(exception& e) {
638                 m->errorOut(e, "SortSeqsCommand", "readFlow");
639                 exit(1);
640         }
641 }
642
643 //**********************************************************************************************************************
644 int SortSeqsCommand::readQual(){
645         try {
646                 string thisOutputDir = outputDir;
647                 if (outputDir == "") {  thisOutputDir += m->hasPath(qualfile);  }
648                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(qualfile)) + "sorted" +  m->getExtension(qualfile);
649         outputTypes["qfile"].push_back(outputFileName);  outputNames.push_back(outputFileName);
650         
651                 ofstream out;
652                 m->openOutputFile(outputFileName, out);
653                 
654                 ifstream in;
655                 m->openInputFile(qualfile, in);
656                 string name;
657                 
658         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
659             
660             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
661                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
662                 //this way we only store 1000 seqs in memory at a time.
663                 
664                 int numNames = names.size();
665                 int numNamesInFile = 0;
666                 
667                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
668                 while(!in.eof()){
669                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
670                     
671                     QualityScores currQual;
672                     currQual = QualityScores(in); 
673                     name = currQual.getName();
674                     
675                     if (name != "") {
676                         numNamesInFile++;
677                         map<string, int>::iterator it = names.find(name);
678                         if (it == names.end()) { 
679                             names[name] = numNames; numNames++;
680                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
681                         }
682                     }
683                     m->gobble(in);
684                 }
685                 in.close();
686                 out.close();
687                 
688                 int numLeft = names.size();
689                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
690                 
691                 int size = 1000; //assume that user can hold 1000 seqs in memory
692                 if (numLeft < size) { size = numLeft; }
693                 int times = 0;
694
695                 
696                 vector<QualityScores> seqs; seqs.resize(size);
697                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
698                 
699                 while (numLeft > 0) {
700                     
701                     ifstream in2;
702                     m->openInputFile(qualfile, in2);
703                     
704                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
705                     
706                     int found = 0;
707                     int needToFind = size;
708                     if (numLeft < size) { needToFind = numLeft; }
709                     
710                     while(!in2.eof()){
711                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
712                         
713                         //stop reading if we already found the seqs we are looking for
714                         if (found >= needToFind) { break; }
715                         
716                         QualityScores currQual;
717                         currQual = QualityScores(in2); 
718                         name = currQual.getName();
719                         
720                         if (name != "") {
721                             map<string, int>::iterator it = names.find(name);
722                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
723                                 //is it in the set of seqs we are looking for this time around
724                                 int thisSeqsPlace = it->second;
725                                 thisSeqsPlace -= (times * size);
726                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
727                                     seqs[thisSeqsPlace] = currQual; 
728                                     found++;
729                                 }
730                             }else { m->mothurOut("[ERROR]: in logic of readQual function.\n"); m->control_pressed = true; }
731                         }
732                         m->gobble(in2);
733                     }
734                     in2.close();        
735                     
736                     ofstream out2;
737                     m->openOutputFileAppend(outputFileName, out2);
738                     
739                     int output = seqs.size();
740                     if (numLeft < seqs.size()) { output = numLeft; }
741                     
742                     for (int i = 0; i < output; i++) {
743                         if (seqs[i].getName() != "") {
744                             seqs[i].printQScores(out2);
745                         }
746                     }
747                     out2.close();
748                     
749                     times++;
750                     numLeft -= output;
751                 }
752                 
753                  m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + qualfile + ".\n");
754                 
755             }else {
756                 
757                 vector<QualityScores> seqs; seqs.resize(names.size());
758                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
759                 
760                 while(!in.eof()){
761                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
762                     
763                     QualityScores currQual;
764                     currQual = QualityScores(in); 
765                     name = currQual.getName();
766                     
767                     if (name != "") {
768                         map<string, int>::iterator it = names.find(name);
769                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
770                             seqs[it->second] = currQual;  
771                         }else { //if we cant find it then add it to the end
772                             names[name] = seqs.size();
773                             seqs.push_back(currQual);
774                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
775                         }
776                     }
777                     m->gobble(in);
778                 }
779                 in.close();     
780                 
781                 int count = 0;
782                 for (int i = 0; i < seqs.size(); i++) {
783                     if (seqs[i].getName() != "") { seqs[i].printQScores(out); count++; }
784                 }
785                 out.close();
786                 
787                 m->mothurOut("Ordered " + toString(count) + " sequences from " + qualfile + ".\n");
788             }
789             
790         }else { //read in file to fill names
791             int count = 0;
792             
793             while(!in.eof()){
794                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
795                 
796                 QualityScores currQual;
797                 currQual = QualityScores(in);  
798                                
799                 m->gobble(in);
800                 
801                 if (currQual.getName() != "") {
802                     //if this name is in the accnos file
803                     names[currQual.getName()] = count;
804                     count++;
805                     currQual.printQScores(out);
806                 }
807                 m->gobble(in);
808             }
809             in.close(); 
810             out.close();
811             
812             m->mothurOut("\nUsing " + qualfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
813         }
814                 
815                 return 0;
816                 
817         }
818         catch(exception& e) {
819                 m->errorOut(e, "SortSeqsCommand", "readQual");
820                 exit(1);
821         }
822 }
823 //**********************************************************************************************************************
824 int SortSeqsCommand::readName(){
825         try {
826                 string thisOutputDir = outputDir;
827                 if (outputDir == "") {  thisOutputDir += m->hasPath(namefile);  }
828                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "sorted" + m->getExtension(namefile);
829         outputTypes["name"].push_back(outputFileName);  outputNames.push_back(outputFileName);
830         
831                 ofstream out;
832                 m->openOutputFile(outputFileName, out);
833         
834                 ifstream in;
835                 m->openInputFile(namefile, in);
836                 string name, firstCol, secondCol;
837                 
838         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
839         
840                 vector<string> seqs; seqs.resize(names.size(), "");
841                 
842                 while(!in.eof()){
843                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
844                     
845                     in >> firstCol;             m->gobble(in);          
846                     in >> secondCol;    m->gobble(in);
847                     
848                     if (firstCol != "") {
849                         map<string, int>::iterator it = names.find(firstCol);
850                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
851                             seqs[it->second] = firstCol + '\t' + secondCol;  
852                         }else { //if we cant find it then add it to the end
853                             names[firstCol] = seqs.size();
854                             seqs.push_back((firstCol + '\t' + secondCol));
855                             m->mothurOut(firstCol + " was not in the contained the file which determined the order, adding it to the end.\n");
856                         }
857                     }
858                 }
859                 in.close();     
860                 
861                 int count = 0;
862                 for (int i = 0; i < seqs.size(); i++) {
863                     if (seqs[i] != "") { out << seqs[i] << endl; count++; }
864                 }
865                 out.close();
866                 
867                 m->mothurOut("Ordered " + toString(count) + " sequences from " + namefile + ".\n");
868             
869         }else { //read in file to fill names
870             int count = 0;
871             
872             while(!in.eof()){
873                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
874                 
875                 in >> firstCol;         m->gobble(in);          
876                 in >> secondCol;    m->gobble(in);
877                 
878                 if (firstCol != "") {
879                     //if this name is in the accnos file
880                     names[firstCol] = count;
881                     count++;
882                     out << firstCol << '\t' << secondCol << endl;
883                 }
884                 m->gobble(in);
885             }
886             in.close(); 
887             out.close();
888             
889             m->mothurOut("\nUsing " + namefile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
890         }
891                                 
892                 return 0;
893         }
894         catch(exception& e) {
895                 m->errorOut(e, "SortSeqsCommand", "readName");
896                 exit(1);
897         }
898 }
899
900 //**********************************************************************************************************************
901 int SortSeqsCommand::readGroup(){
902         try {
903                 string thisOutputDir = outputDir;
904                 if (outputDir == "") {  thisOutputDir += m->hasPath(groupfile);  }
905                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pick" + m->getExtension(groupfile);
906                 outputTypes["group"].push_back(outputFileName);  outputNames.push_back(outputFileName);
907         
908                 ofstream out;
909                 m->openOutputFile(outputFileName, out);
910         
911                 ifstream in;
912                 m->openInputFile(groupfile, in);
913                 string name, group;
914                 
915                 if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
916             
917             vector<string> seqs; seqs.resize(names.size(), "");
918             
919             while(!in.eof()){
920                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
921                 
922                 in >> name;             m->gobble(in);          
923                 in >> group;    m->gobble(in);
924                 
925                 if (name != "") {
926                     map<string, int>::iterator it = names.find(name);
927                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
928                         seqs[it->second] = name + '\t' + group;  
929                     }else { //if we cant find it then add it to the end
930                         names[name] = seqs.size();
931                         seqs.push_back((name + '\t' + group));
932                         m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
933                     }
934                 }
935             }
936             in.close(); 
937             
938             int count = 0;
939             for (int i = 0; i < seqs.size(); i++) {
940                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
941             }
942             out.close();
943             
944             m->mothurOut("Ordered " + toString(count) + " sequences from " + groupfile + ".\n");
945             
946         }else { //read in file to fill names
947             int count = 0;
948             
949             while(!in.eof()){
950                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
951                 
952                 in >> name;             m->gobble(in);          
953                 in >> group;    m->gobble(in);
954                 
955                 if (name != "") {
956                     //if this name is in the accnos file
957                     names[name] = count;
958                     count++;
959                     out << name << '\t' << group << endl;
960                 }
961                 m->gobble(in);
962             }
963             in.close(); 
964             out.close();
965             
966             m->mothurOut("\nUsing " + groupfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
967         }
968         
969                 return 0;
970         }
971         catch(exception& e) {
972                 m->errorOut(e, "SortSeqsCommand", "readGroup");
973                 exit(1);
974         }
975 }
976 //**********************************************************************************************************************
977 int SortSeqsCommand::readTax(){
978         try {
979                 string thisOutputDir = outputDir;
980                 if (outputDir == "") {  thisOutputDir += m->hasPath(taxfile);  }
981                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pick" + m->getExtension(taxfile);
982         outputTypes["taxonomy"].push_back(outputFileName);  outputNames.push_back(outputFileName);
983         
984                 ofstream out;
985                 m->openOutputFile(outputFileName, out);
986         
987                 ifstream in;
988                 m->openInputFile(taxfile, in);
989                 string name, tax;
990                 
991                 if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
992             
993             vector<string> seqs; seqs.resize(names.size(), "");
994             
995             while(!in.eof()){
996                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
997                 
998                 in >> name;             m->gobble(in);          
999                 in >> tax;    m->gobble(in);
1000                 
1001                 if (name != "") {
1002                     map<string, int>::iterator it = names.find(name);
1003                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
1004                         seqs[it->second] = name + '\t' + tax;  
1005                     }else { //if we cant find it then add it to the end
1006                         names[name] = seqs.size();
1007                         seqs.push_back((name + '\t' + tax));
1008                         m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
1009                     }
1010                 }
1011             }
1012             in.close(); 
1013             
1014             int count = 0;
1015             for (int i = 0; i < seqs.size(); i++) {
1016                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
1017             }
1018             out.close();
1019             
1020             m->mothurOut("Ordered " + toString(count) + " sequences from " + taxfile + ".\n");
1021             
1022         }else { //read in file to fill names
1023             int count = 0;
1024             
1025             while(!in.eof()){
1026                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1027                 
1028                 in >> name;             m->gobble(in);          
1029                 in >> tax;    m->gobble(in);
1030                 
1031                 if (name != "") {
1032                     //if this name is in the accnos file
1033                     names[name] = count;
1034                     count++;
1035                     out << name << '\t' << tax << endl;
1036                 }
1037                 m->gobble(in);
1038             }
1039             in.close(); 
1040             out.close();
1041             
1042             m->mothurOut("\nUsing " + taxfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
1043         }
1044         
1045                 return 0;
1046                 return 0;
1047         }
1048         catch(exception& e) {
1049                 m->errorOut(e, "SortSeqsCommand", "readTax");
1050                 exit(1);
1051         }
1052 }
1053 //**********************************************************************************************************************
1054 int SortSeqsCommand::readAccnos(){
1055         try {
1056                 
1057                 ifstream in;
1058                 m->openInputFile(accnosfile, in);
1059                 string name;
1060         int count = 0;
1061                 
1062                 while(!in.eof()){
1063             
1064             if (m->control_pressed) { break; }
1065             
1066                         in >> name; m->gobble(in);
1067             
1068             if (name != "") {
1069                 names[name] = count;
1070                 count++;
1071             }
1072                 }
1073                 in.close();             
1074         
1075         m->mothurOut("\nUsing " + accnosfile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
1076         
1077         return 0;
1078         }
1079         catch(exception& e) {
1080                 m->errorOut(e, "SortSeqsCommand", "readAccnos");
1081                 exit(1);
1082         }
1083 }
1084
1085 //**********************************************************************************************************************
1086
1087
1088
1089
1090