]> git.donarmstrong.com Git - mothur.git/blob - sortseqscommand.cpp
changes while testing
[mothur.git] / sortseqscommand.cpp
1 //
2 //  sortseqscommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 2/3/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "sortseqscommand.h"
10 #include "sequence.hpp"
11 #include "qualityscores.h"
12
13 //**********************************************************************************************************************
14 vector<string> SortSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none","fasta",false,false); parameters.push_back(pfasta);
17         CommandParameter pflow("flow", "InputTypes", "", "", "none", "FNGLT", "none","flow",false,false); parameters.push_back(pflow);
18         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none","name",false,false); parameters.push_back(pname);
19         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none","count",false,false); parameters.push_back(pcount);
20                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none","group",false,false); parameters.push_back(pgroup);
21                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none","taxonomy",false,false); parameters.push_back(ptaxonomy);
22                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none","qfile",false,false); parameters.push_back(pqfile);
23                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
24                 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos);
25         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "SortSeqsCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string SortSeqsCommand::getHelpString(){        
39         try {
40                 string helpString = "";
41                 helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos fasta, name, group, count, taxonomy, flow or quality file.\n";
42         helpString += "The sort.seqs command parameters are accnos, fasta, name, group, count, taxonomy, flow, qfile and large.\n";
43         helpString += "The accnos file allows you to specify the order you want the files in.  If none is provided, mothur will use the order of the first file it reads.\n";
44         helpString += "The large parameters is used to indicate your files are too large to fit in RAM.\n";
45                 helpString += "The sort.seqs command should be in the following format: sort.seqs(fasta=yourFasta).\n";
46                 helpString += "Example sort.seqs(fasta=amazon.fasta).\n";
47                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
48                 return helpString;
49         }
50         catch(exception& e) {
51                 m->errorOut(e, "SortSeqsCommand", "getHelpString");
52                 exit(1);
53         }
54 }
55 //**********************************************************************************************************************
56 string SortSeqsCommand::getOutputPattern(string type) {
57     try {
58         string pattern = "";
59         
60         if (type == "fasta")            {   pattern = "[filename],sorted,[extension]";    }
61         else if (type == "taxonomy")    {   pattern = "[filename],sorted,[extension]";    }
62         else if (type == "name")        {   pattern = "[filename],sorted,[extension]";    }
63         else if (type == "group")       {   pattern = "[filename],sorted,[extension]";    }
64         else if (type == "count")       {   pattern = "[filename],sorted,[extension]";    }
65         else if (type == "flow")        {   pattern = "[filename],sorted,[extension]";    }
66         else if (type == "qfile")      {   pattern = "[filename],sorted,[extension]";    }
67         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
68         
69         return pattern;
70     }
71     catch(exception& e) {
72         m->errorOut(e, "SortSeqsCommand", "getOutputPattern");
73         exit(1);
74     }
75 }
76 //**********************************************************************************************************************
77 SortSeqsCommand::SortSeqsCommand(){     
78         try {
79                 abort = true; calledHelp = true; 
80                 setParameters();
81                 vector<string> tempOutNames;
82                 outputTypes["fasta"] = tempOutNames;
83                 outputTypes["taxonomy"] = tempOutNames;
84                 outputTypes["name"] = tempOutNames;
85         outputTypes["count"] = tempOutNames;
86                 outputTypes["group"] = tempOutNames;
87                 outputTypes["qfile"] = tempOutNames;
88         outputTypes["flow"] = tempOutNames;
89         }
90         catch(exception& e) {
91                 m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
92                 exit(1);
93         }
94 }
95 //**********************************************************************************************************************
96 SortSeqsCommand::SortSeqsCommand(string option)  {
97         try {
98                 abort = false; calledHelp = false;   
99                 
100                 //allow user to run help
101                 if(option == "help") { help(); abort = true; calledHelp = true; }
102                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
103                 
104                 else {
105                         vector<string> myArray = setParameters();
106                         
107                         OptionParser parser(option);
108                         map<string,string> parameters = parser.getParameters();
109                         
110                         ValidParameters validParameter;
111                         map<string,string>::iterator it;
112                         
113                         //check to make sure all parameters are valid for command
114                         for (it = parameters.begin(); it != parameters.end(); it++) { 
115                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
116                         }
117                         
118                         //initialize outputTypes
119                         vector<string> tempOutNames;
120                         outputTypes["fasta"] = tempOutNames;
121                         outputTypes["taxonomy"] = tempOutNames;
122                         outputTypes["name"] = tempOutNames;
123                         outputTypes["group"] = tempOutNames;
124                         outputTypes["qfile"] = tempOutNames;
125             outputTypes["flow"] = tempOutNames;
126             outputTypes["count"] = tempOutNames;
127                         
128                         //if the user changes the output directory command factory will send this info to us in the output parameter 
129                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
130                         
131                         //if the user changes the input directory command factory will send this info to us in the output parameter 
132                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
133                         if (inputDir == "not found"){   inputDir = "";          }
134                         else {
135                                 string path;
136                                 it = parameters.find("fasta");
137                                 //user has given a template file
138                                 if(it != parameters.end()){ 
139                                         path = m->hasPath(it->second);
140                                         //if the user has not given a path then, add inputdir. else leave path alone.
141                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
142                                 }
143                                 
144                                 it = parameters.find("name");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
150                                 }
151                                 
152                                 it = parameters.find("group");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
158                                 }
159                                 
160                                 it = parameters.find("taxonomy");
161                                 //user has given a template file
162                                 if(it != parameters.end()){ 
163                                         path = m->hasPath(it->second);
164                                         //if the user has not given a path then, add inputdir. else leave path alone.
165                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
166                                 }
167                                 
168                                 it = parameters.find("qfile");
169                                 //user has given a template file
170                                 if(it != parameters.end()){ 
171                                         path = m->hasPath(it->second);
172                                         //if the user has not given a path then, add inputdir. else leave path alone.
173                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
174                                 }
175                 
176                 it = parameters.find("accnos");
177                                 //user has given a template file
178                                 if(it != parameters.end()){ 
179                                         path = m->hasPath(it->second);
180                                         //if the user has not given a path then, add inputdir. else leave path alone.
181                                         if (path == "") {       parameters["accnos"] = inputDir + it->second;           }
182                                 }
183                 
184                 it = parameters.find("flow");
185                                 //user has given a template file
186                                 if(it != parameters.end()){ 
187                                         path = m->hasPath(it->second);
188                                         //if the user has not given a path then, add inputdir. else leave path alone.
189                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
190                                 }
191                 
192                 it = parameters.find("count");
193                                 //user has given a template file
194                                 if(it != parameters.end()){ 
195                                         path = m->hasPath(it->second);
196                                         //if the user has not given a path then, add inputdir. else leave path alone.
197                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
198                                 }
199                         }
200             
201                         
202                         //check for parameters
203             accnosfile = validParameter.validFile(parameters, "accnos", true);
204                         if (accnosfile == "not open") { accnosfile = ""; abort = true; }
205                         else if (accnosfile == "not found") {  accnosfile = "";  }      
206                         else { m->setAccnosFile(accnosfile); }
207             
208                         fastafile = validParameter.validFile(parameters, "fasta", true);
209                         if (fastafile == "not open") { fastafile = ""; abort = true; }
210                         else if (fastafile == "not found") {  fastafile = "";  }        
211                         else { m->setFastaFile(fastafile); }
212             
213             flowfile = validParameter.validFile(parameters, "flow", true);
214                         if (flowfile == "not open") { flowfile = ""; abort = true; }
215                         else if (flowfile == "not found") {  flowfile = "";  }  
216                         else { m->setFlowFile(flowfile); }
217             
218                         namefile = validParameter.validFile(parameters, "name", true);
219                         if (namefile == "not open") { namefile = ""; abort = true; }
220                         else if (namefile == "not found") {  namefile = "";  }  
221                         else { m->setNameFile(namefile); } 
222             
223                         groupfile = validParameter.validFile(parameters, "group", true);
224                         if (groupfile == "not open") { abort = true; }
225                         else if (groupfile == "not found") {  groupfile = "";  }
226                         else { m->setGroupFile(groupfile); }
227                         
228                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
229                         if (taxfile == "not open") { abort = true; }
230                         else if (taxfile == "not found") {  taxfile = "";  }
231                         else { m->setTaxonomyFile(taxfile); }
232                         
233                         qualfile = validParameter.validFile(parameters, "qfile", true);
234                         if (qualfile == "not open") { abort = true; }
235                         else if (qualfile == "not found") {  qualfile = "";  }                  
236                         else { m->setQualFile(qualfile); }
237             
238             countfile = validParameter.validFile(parameters, "count", true);
239                         if (countfile == "not open") { countfile = ""; abort = true; }
240                         else if (countfile == "not found") { countfile = "";  } 
241                         else { m->setCountTableFile(countfile); }
242             
243             if ((namefile != "") && (countfile != "")) {
244                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
245             }
246                         
247             if ((groupfile != "") && (countfile != "")) {
248                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
249             }
250                         
251             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") { temp = "f"; }
252                         large = m->isTrue(temp);
253             
254                         if ((fastafile == "") && (namefile == "") && (countfile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, group, count, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; }
255                         
256             if (countfile == "") {
257                 if ((fastafile != "") && (namefile == "")) {
258                     vector<string> files; files.push_back(fastafile);
259                     parser.getNameFile(files);
260                 }
261             }
262                 }
263         
264         }
265         catch(exception& e) {
266                 m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
267                 exit(1);
268         }
269 }
270 //**********************************************************************************************************************
271
272 int SortSeqsCommand::execute(){
273         try {
274                 
275                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
276                 
277                 //read through the correct file and output lines you want to keep
278         if (accnosfile != "")           {               
279             vector<string> temp;
280             m->readAccnos(accnosfile, temp);
281             for (int i = 0; i < temp.size(); i++) {  names[temp[i]] = i;  }
282             m->mothurOut("\nUsing " + accnosfile + " to determine the order. It contains " + toString(temp.size()) + " representative sequences.\n");   
283         }
284         
285                 if (fastafile != "")            {               readFasta();    }
286         if (flowfile != "")         {           readFlow();     }
287         if (qualfile != "")                     {               readQual();             }
288         if (namefile != "")                     {               readName();             }
289                 if (groupfile != "")            {               readGroup();    }
290         if (countfile != "")            {               readCount();    }
291         if (taxfile != "")                      {               readTax();              }
292                 
293                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
294         
295                 if (outputNames.size() != 0) {
296                         m->mothurOutEndLine();
297                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
298                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
299                         m->mothurOutEndLine();
300                         
301                         //set fasta file as new current fastafile
302                         string current = "";
303                         itTypes = outputTypes.find("fasta");
304                         if (itTypes != outputTypes.end()) {
305                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
306                         }
307                         
308                         itTypes = outputTypes.find("name");
309                         if (itTypes != outputTypes.end()) {
310                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
311                         }
312                         
313                         itTypes = outputTypes.find("group");
314                         if (itTypes != outputTypes.end()) {
315                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
316                         }
317                         
318                         
319                         itTypes = outputTypes.find("taxonomy");
320                         if (itTypes != outputTypes.end()) {
321                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
322                         }
323                         
324                         itTypes = outputTypes.find("qfile");
325                         if (itTypes != outputTypes.end()) {
326                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
327                         }       
328             
329             itTypes = outputTypes.find("flow");
330                         if (itTypes != outputTypes.end()) {
331                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFlowFile(current); }
332                         }
333             
334             itTypes = outputTypes.find("count");
335                         if (itTypes != outputTypes.end()) {
336                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
337                         }
338                 }
339                 
340                 return 0;               
341         }
342     
343         catch(exception& e) {
344                 m->errorOut(e, "SortSeqsCommand", "execute");
345                 exit(1);
346         }
347 }
348
349 //**********************************************************************************************************************
350 int SortSeqsCommand::readFasta(){
351         try {
352                 string thisOutputDir = outputDir;
353                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
354                 map<string, string> variables; 
355         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
356         variables["[extension]"] = m->getExtension(fastafile);
357                 string outputFileName = getOutputFileName("fasta", variables);
358                 outputTypes["fasta"].push_back(outputFileName);  outputNames.push_back(outputFileName);
359         
360                 ofstream out;
361                 m->openOutputFile(outputFileName, out);
362                 
363                 ifstream in;
364                 m->openInputFile(fastafile, in);
365                 string name;
366                 
367         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
368             
369             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
370                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
371                 //this way we only store 1000 seqs in memory at a time.
372                 
373                 int numNames = names.size();
374                 int numNamesInFile = 0;
375                 
376                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
377                 while(!in.eof()){
378                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
379                     
380                     Sequence currSeq(in);
381                     name = currSeq.getName();
382                     
383                     if (name != "") {
384                         numNamesInFile++;
385                         map<string, int>::iterator it = names.find(name);
386                         if (it == names.end()) { 
387                             names[name] = numNames; numNames++;
388                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
389                         }
390                     }
391                     m->gobble(in);
392                 }
393                 in.close();
394                 out.close();
395                 
396                 int numLeft = names.size();
397                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
398                 
399                 int size = 1000; //assume that user can hold 1000 seqs in memory
400                 if (numLeft < size) { size = numLeft; }
401                 int times = 0;
402                 
403                 vector<Sequence> seqs; seqs.resize(size);
404                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
405                 
406                 while (numLeft > 0) {
407                     
408                     ifstream in2;
409                     m->openInputFile(fastafile, in2);
410                     
411                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
412                     
413                     int found = 0;
414                     int needToFind = size;
415                     if (numLeft < size) { needToFind = numLeft; }
416                     
417                     while(!in2.eof()){
418                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
419                         
420                         //stop reading if we already found the seqs we are looking for
421                         if (found >= needToFind) { break; }
422                         
423                         Sequence currSeq(in2);
424                         name = currSeq.getName();
425                         
426                         if (name != "") {
427                             map<string, int>::iterator it = names.find(name);
428                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
429                                 //is it in the set of seqs we are looking for this time around
430                                 int thisSeqsPlace = it->second;
431                                 thisSeqsPlace -= (times * size);
432                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
433                                     seqs[thisSeqsPlace] = currSeq; 
434                                     found++;
435                                 }
436                             }else { m->mothurOut("[ERROR]: in logic of readFasta function.\n"); m->control_pressed = true; }
437                         }
438                         m->gobble(in2);
439                     }
440                     in2.close();        
441
442                     ofstream out2;
443                     m->openOutputFileAppend(outputFileName, out2);
444                     
445                     int output = seqs.size();
446                     if (numLeft < seqs.size()) { output = numLeft; }
447                         
448                     for (int i = 0; i < output; i++) {
449                         if (seqs[i].getName() != "") { seqs[i].printSequence(out2); }
450                     }
451                     out2.close();
452                     
453                     times++;
454                     numLeft -= output;
455                 }
456                 
457                 m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + fastafile + ".\n");
458             }else {
459                 
460                 vector<Sequence> seqs; seqs.resize(names.size());
461                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
462                 
463                 while(!in.eof()){
464                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
465                     
466                     Sequence currSeq(in);
467                     name = currSeq.getName();
468                     
469                     if (name != "") {
470                         map<string, int>::iterator it = names.find(name);
471                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
472                             seqs[it->second] = currSeq;  
473                         }else { //if we cant find it then add it to the end
474                             names[name] = seqs.size();
475                             seqs.push_back(currSeq);
476                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
477                         }
478                     }
479                     m->gobble(in);
480                 }
481                 in.close();     
482                 
483                 int count = 0;
484                 for (int i = 0; i < seqs.size(); i++) {
485                     if (seqs[i].getName() != "") {
486                         seqs[i].printSequence(out); count++;
487                     }
488                 }
489                 out.close();
490                 
491                 m->mothurOut("Ordered " + toString(count) + " sequences from " + fastafile + ".\n");
492             }
493                         
494         }else { //read in file to fill names
495             int count = 0;
496             
497             while(!in.eof()){
498                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
499                 
500                 Sequence currSeq(in);
501                 name = currSeq.getName();
502                 
503                 if (name != "") {
504                     //if this name is in the accnos file
505                     names[name] = count;
506                     count++;
507                     currSeq.printSequence(out);
508                 }
509                 m->gobble(in);
510             }
511             in.close(); 
512             out.close();
513             
514             m->mothurOut("\nUsing " + fastafile + " to determine the order. It contains " + toString(count) + " sequences.\n");
515         }
516                                 
517                 return 0;
518                 
519         }
520         catch(exception& e) {
521                 m->errorOut(e, "SortSeqsCommand", "readFasta");
522                 exit(1);
523         }
524 }
525 //**********************************************************************************************************************
526 int SortSeqsCommand::readFlow(){
527         try {
528                 string thisOutputDir = outputDir;
529                 if (outputDir == "") {  thisOutputDir += m->hasPath(flowfile);  }
530         map<string, string> variables; 
531         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowfile));
532         variables["[extension]"] = m->getExtension(flowfile);
533                 string outputFileName = getOutputFileName("flow", variables);
534                 outputTypes["flow"].push_back(outputFileName);  outputNames.push_back(outputFileName);
535         
536                 ofstream out;
537                 m->openOutputFile(outputFileName, out);
538                 
539                 ifstream in;
540                 m->openInputFile(flowfile, in);
541         int numFlows;
542                 string name;
543         
544         in >> numFlows; m->gobble(in);
545                 
546         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
547             
548             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
549                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
550                 //this way we only store 1000 seqs in memory at a time.
551                 
552                 int numNames = names.size();
553                 int numNamesInFile = 0;
554                 
555                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
556                 while(!in.eof()){
557                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
558                     
559                     in >> name; 
560                     string rest = m->getline(in);
561                     
562                     if (name != "") {
563                         numNamesInFile++;
564                         map<string, int>::iterator it = names.find(name);
565                         if (it == names.end()) { 
566                             names[name] = numNames; numNames++;
567                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
568                         }
569                     }
570                     m->gobble(in);
571                 }
572                 in.close();
573                 out.close();
574                 
575                 int numLeft = names.size();
576                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
577                 
578                 int size = 1000; //assume that user can hold 1000 seqs in memory
579                 if (numLeft < size) { size = numLeft; }
580                 int times = 0;
581                 
582                 vector<string> seqs; seqs.resize(size, "");
583                 
584                 while (numLeft > 0) {
585                     
586                     ifstream in2;
587                     m->openInputFile(flowfile, in2); in2 >> numFlows; m->gobble(in2);
588                     
589                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
590                     
591                     int found = 0;
592                     int needToFind = size;
593                     if (numLeft < size) { needToFind = numLeft; }
594                     
595                     while(!in2.eof()){
596                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
597                         
598                         //stop reading if we already found the seqs we are looking for
599                         if (found >= needToFind) { break; }
600                         
601                         in2 >> name;    
602                         string rest = m->getline(in2);
603                         
604                         if (name != "") {
605                             map<string, int>::iterator it = names.find(name);
606                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
607                                 //is it in the set of seqs we are looking for this time around
608                                 int thisSeqsPlace = it->second;
609                                 thisSeqsPlace -= (times * size);
610                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
611                                     seqs[thisSeqsPlace] = (name +'\t' + rest); 
612                                     found++;
613                                 }
614                             }else { m->mothurOut("[ERROR]: in logic of readFlow function.\n"); m->control_pressed = true; }
615                         }
616                         m->gobble(in2);
617                     }
618                     in2.close();        
619                     
620                     ofstream out2;
621                     m->openOutputFileAppend(outputFileName, out2);
622                     
623                     int output = seqs.size();
624                     if (numLeft < seqs.size()) { output = numLeft; }
625                     
626                     for (int i = 0; i < output; i++) {
627                         if (seqs[i] != "") {
628                             out2 << seqs[i] << endl;
629                         }
630                     }
631                     out2.close();
632                     
633                     times++;
634                     numLeft -= output;
635                 }
636                 
637                 m->mothurOut("Ordered " + toString(numNamesInFile) + " flows from " + flowfile + ".\n");
638             }else {
639                 
640                 vector<string> seqs; seqs.resize(names.size(), "");
641                 
642                 while(!in.eof()){
643                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
644                     
645                     in >> name; 
646                     string rest = m->getline(in);
647                     
648                     if (name != "") {
649                         map<string, int>::iterator it = names.find(name);
650                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
651                             seqs[it->second] = (name + '\t' + rest);  
652                         }else { //if we cant find it then add it to the end
653                             names[name] = seqs.size();
654                             seqs.push_back((name + '\t' + rest));
655                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
656                         }
657                     }
658                     m->gobble(in);
659                 }
660                 in.close();     
661                 
662                 int count = 0;
663                 for (int i = 0; i < seqs.size(); i++) {
664                     if (seqs[i] != "") {
665                         out << seqs[i] << endl;
666                         count++;
667                     }
668                 }
669                 out.close();
670                 
671                 m->mothurOut("Ordered " + toString(count) + " flows from " + flowfile + ".\n");
672             }
673             
674         }else { //read in file to fill names
675             int count = 0;
676             
677             while(!in.eof()){
678                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
679                 
680                 in >> name;     
681                 string rest = m->getline(in);
682                 
683                 if (name != "") {
684                     //if this name is in the accnos file
685                     names[name] = count;
686                     count++;
687                     out << name << '\t' << rest << endl;
688                 }
689                 m->gobble(in);
690             }
691             in.close(); 
692             out.close();
693             
694             m->mothurOut("\nUsing " + flowfile + " to determine the order. It contains " + toString(count) + " flows.\n");
695         }
696         
697                 return 0;
698                 
699         }
700         catch(exception& e) {
701                 m->errorOut(e, "SortSeqsCommand", "readFlow");
702                 exit(1);
703         }
704 }
705
706 //**********************************************************************************************************************
707 int SortSeqsCommand::readQual(){
708         try {
709                 string thisOutputDir = outputDir;
710                 if (outputDir == "") {  thisOutputDir += m->hasPath(qualfile);  }
711                 map<string, string> variables; 
712         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(qualfile));
713         variables["[extension]"] = m->getExtension(qualfile);
714                 string outputFileName = getOutputFileName("qfile", variables);
715         outputTypes["qfile"].push_back(outputFileName);  outputNames.push_back(outputFileName);
716         
717                 ofstream out;
718                 m->openOutputFile(outputFileName, out);
719                 
720                 ifstream in;
721                 m->openInputFile(qualfile, in);
722                 string name;
723                 
724         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
725             
726             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
727                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
728                 //this way we only store 1000 seqs in memory at a time.
729                 
730                 int numNames = names.size();
731                 int numNamesInFile = 0;
732                 
733                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
734                 while(!in.eof()){
735                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
736                     
737                     QualityScores currQual;
738                     currQual = QualityScores(in); 
739                     name = currQual.getName();
740                     
741                     if (name != "") {
742                         numNamesInFile++;
743                         map<string, int>::iterator it = names.find(name);
744                         if (it == names.end()) { 
745                             names[name] = numNames; numNames++;
746                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
747                         }
748                     }
749                     m->gobble(in);
750                 }
751                 in.close();
752                 out.close();
753                 
754                 int numLeft = names.size();
755                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
756                 
757                 int size = 1000; //assume that user can hold 1000 seqs in memory
758                 if (numLeft < size) { size = numLeft; }
759                 int times = 0;
760
761                 
762                 vector<QualityScores> seqs; seqs.resize(size);
763                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
764                 
765                 while (numLeft > 0) {
766                     
767                     ifstream in2;
768                     m->openInputFile(qualfile, in2);
769                     
770                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
771                     
772                     int found = 0;
773                     int needToFind = size;
774                     if (numLeft < size) { needToFind = numLeft; }
775                     
776                     while(!in2.eof()){
777                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
778                         
779                         //stop reading if we already found the seqs we are looking for
780                         if (found >= needToFind) { break; }
781                         
782                         QualityScores currQual;
783                         currQual = QualityScores(in2); 
784                         name = currQual.getName();
785                         
786                         if (name != "") {
787                             map<string, int>::iterator it = names.find(name);
788                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
789                                 //is it in the set of seqs we are looking for this time around
790                                 int thisSeqsPlace = it->second;
791                                 thisSeqsPlace -= (times * size);
792                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
793                                     seqs[thisSeqsPlace] = currQual; 
794                                     found++;
795                                 }
796                             }else { m->mothurOut("[ERROR]: in logic of readQual function.\n"); m->control_pressed = true; }
797                         }
798                         m->gobble(in2);
799                     }
800                     in2.close();        
801                     
802                     ofstream out2;
803                     m->openOutputFileAppend(outputFileName, out2);
804                     
805                     int output = seqs.size();
806                     if (numLeft < seqs.size()) { output = numLeft; }
807                     
808                     for (int i = 0; i < output; i++) {
809                         if (seqs[i].getName() != "") {
810                             seqs[i].printQScores(out2);
811                         }
812                     }
813                     out2.close();
814                     
815                     times++;
816                     numLeft -= output;
817                 }
818                 
819                  m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + qualfile + ".\n");
820                 
821             }else {
822                 
823                 vector<QualityScores> seqs; seqs.resize(names.size());
824                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
825                 
826                 while(!in.eof()){
827                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
828                     
829                     QualityScores currQual;
830                     currQual = QualityScores(in); 
831                     name = currQual.getName();
832                     
833                     if (name != "") {
834                         map<string, int>::iterator it = names.find(name);
835                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
836                             seqs[it->second] = currQual;  
837                         }else { //if we cant find it then add it to the end
838                             names[name] = seqs.size();
839                             seqs.push_back(currQual);
840                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
841                         }
842                     }
843                     m->gobble(in);
844                 }
845                 in.close();     
846                 
847                 int count = 0;
848                 for (int i = 0; i < seqs.size(); i++) {
849                     if (seqs[i].getName() != "") { seqs[i].printQScores(out); count++; }
850                 }
851                 out.close();
852                 
853                 m->mothurOut("Ordered " + toString(count) + " sequences from " + qualfile + ".\n");
854             }
855             
856         }else { //read in file to fill names
857             int count = 0;
858             
859             while(!in.eof()){
860                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
861                 
862                 QualityScores currQual;
863                 currQual = QualityScores(in);  
864                                
865                 m->gobble(in);
866                 
867                 if (currQual.getName() != "") {
868                     //if this name is in the accnos file
869                     names[currQual.getName()] = count;
870                     count++;
871                     currQual.printQScores(out);
872                 }
873                 m->gobble(in);
874             }
875             in.close(); 
876             out.close();
877             
878             m->mothurOut("\nUsing " + qualfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
879         }
880                 
881                 return 0;
882                 
883         }
884         catch(exception& e) {
885                 m->errorOut(e, "SortSeqsCommand", "readQual");
886                 exit(1);
887         }
888 }
889 //**********************************************************************************************************************
890 int SortSeqsCommand::readName(){
891         try {
892                 string thisOutputDir = outputDir;
893                 if (outputDir == "") {  thisOutputDir += m->hasPath(namefile);  }
894         map<string, string> variables; 
895                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
896         variables["[extension]"] = m->getExtension(namefile);
897                 string outputFileName = getOutputFileName("name", variables);
898         outputTypes["name"].push_back(outputFileName);  outputNames.push_back(outputFileName);
899         
900                 ofstream out;
901                 m->openOutputFile(outputFileName, out);
902         
903                 ifstream in;
904                 m->openInputFile(namefile, in);
905                 string name, firstCol, secondCol;
906                 
907         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
908         
909                 vector<string> seqs; seqs.resize(names.size(), "");
910                 
911                 while(!in.eof()){
912                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
913                     
914                     in >> firstCol;             m->gobble(in);          
915                     in >> secondCol;    m->gobble(in);
916                     
917                     if (firstCol != "") {
918                         map<string, int>::iterator it = names.find(firstCol);
919                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
920                             seqs[it->second] = firstCol + '\t' + secondCol;  
921                         }else { //if we cant find it then add it to the end
922                             names[firstCol] = seqs.size();
923                             seqs.push_back((firstCol + '\t' + secondCol));
924                             m->mothurOut(firstCol + " was not in the contained the file which determined the order, adding it to the end.\n");
925                         }
926                     }
927                 }
928                 in.close();     
929                 
930                 int count = 0;
931                 for (int i = 0; i < seqs.size(); i++) {
932                     if (seqs[i] != "") { out << seqs[i] << endl; count++; }
933                 }
934                 out.close();
935                 
936                 m->mothurOut("Ordered " + toString(count) + " sequences from " + namefile + ".\n");
937             
938         }else { //read in file to fill names
939             int count = 0;
940             
941             while(!in.eof()){
942                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
943                 
944                 in >> firstCol;         m->gobble(in);          
945                 in >> secondCol;    m->gobble(in);
946                 
947                 if (firstCol != "") {
948                     //if this name is in the accnos file
949                     names[firstCol] = count;
950                     count++;
951                     out << firstCol << '\t' << secondCol << endl;
952                 }
953                 m->gobble(in);
954             }
955             in.close(); 
956             out.close();
957             
958             m->mothurOut("\nUsing " + namefile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
959         }
960                                 
961                 return 0;
962         }
963         catch(exception& e) {
964                 m->errorOut(e, "SortSeqsCommand", "readName");
965                 exit(1);
966         }
967 }
968 //**********************************************************************************************************************
969 int SortSeqsCommand::readCount(){
970         try {
971                 string thisOutputDir = outputDir;
972                 if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
973         map<string, string> variables; 
974                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(countfile));
975         variables["[extension]"] = m->getExtension(countfile);
976                 string outputFileName = getOutputFileName("count", variables);
977         outputTypes["count"].push_back(outputFileName);  outputNames.push_back(outputFileName);
978         
979                 ofstream out;
980                 m->openOutputFile(outputFileName, out);
981         
982                 ifstream in;
983                 m->openInputFile(countfile, in);
984                 string firstCol, rest;
985                 
986         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
987             
988             vector<string> seqs; seqs.resize(names.size(), "");
989             
990             string headers = m->getline(in); m->gobble(in);
991             
992             while(!in.eof()){
993                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
994                 
995                 in >> firstCol;         m->gobble(in);          
996                 rest = m->getline(in);    m->gobble(in);
997                 
998                 if (firstCol != "") {
999                     map<string, int>::iterator it = names.find(firstCol);
1000                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
1001                         seqs[it->second] = firstCol + '\t' + rest;  
1002                     }else { //if we cant find it then add it to the end
1003                         names[firstCol] = seqs.size();
1004                         seqs.push_back((firstCol + '\t' + rest));
1005                         m->mothurOut(firstCol + " was not in the contained the file which determined the order, adding it to the end.\n");
1006                     }
1007                 }
1008             }
1009             in.close(); 
1010             
1011             int count = 0;
1012             out << headers << endl;
1013             for (int i = 0; i < seqs.size(); i++) {
1014                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
1015             }
1016             out.close();
1017             
1018             m->mothurOut("Ordered " + toString(count) + " sequences from " + countfile + ".\n");
1019             
1020         }else { //read in file to fill names
1021             int count = 0;
1022             
1023             string headers = m->getline(in); m->gobble(in);
1024             out << headers << endl;
1025             
1026             while(!in.eof()){
1027                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1028                 
1029                 in >> firstCol;         m->gobble(in);          
1030                 rest = m->getline(in);  m->gobble(in);
1031                 
1032                 if (firstCol != "") {
1033                     //if this name is in the accnos file
1034                     names[firstCol] = count;
1035                     count++;
1036                     out << firstCol << '\t' << rest << endl;
1037                 }
1038                 m->gobble(in);
1039             }
1040             in.close(); 
1041             out.close();
1042             
1043             m->mothurOut("\nUsing " + countfile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
1044         }
1045         
1046                 return 0;
1047         }
1048         catch(exception& e) {
1049                 m->errorOut(e, "SortSeqsCommand", "readCount");
1050                 exit(1);
1051         }
1052 }
1053 //**********************************************************************************************************************
1054 int SortSeqsCommand::readGroup(){
1055         try {
1056                 string thisOutputDir = outputDir;
1057                 if (outputDir == "") {  thisOutputDir += m->hasPath(groupfile);  }
1058                 map<string, string> variables; 
1059                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
1060         variables["[extension]"] = m->getExtension(groupfile);
1061                 string outputFileName = getOutputFileName("group", variables);
1062         outputTypes["group"].push_back(outputFileName);  outputNames.push_back(outputFileName);
1063         
1064                 ofstream out;
1065                 m->openOutputFile(outputFileName, out);
1066         
1067                 ifstream in;
1068                 m->openInputFile(groupfile, in);
1069                 string name, group;
1070                 
1071                 if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
1072             
1073             vector<string> seqs; seqs.resize(names.size(), "");
1074             
1075             while(!in.eof()){
1076                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1077                 
1078                 in >> name;             m->gobble(in);          
1079                 in >> group;    m->gobble(in);
1080                 
1081                 if (name != "") {
1082                     map<string, int>::iterator it = names.find(name);
1083                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
1084                         seqs[it->second] = name + '\t' + group;  
1085                     }else { //if we cant find it then add it to the end
1086                         names[name] = seqs.size();
1087                         seqs.push_back((name + '\t' + group));
1088                         m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
1089                     }
1090                 }
1091             }
1092             in.close(); 
1093             
1094             int count = 0;
1095             for (int i = 0; i < seqs.size(); i++) {
1096                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
1097             }
1098             out.close();
1099             
1100             m->mothurOut("Ordered " + toString(count) + " sequences from " + groupfile + ".\n");
1101             
1102         }else { //read in file to fill names
1103             int count = 0;
1104             
1105             while(!in.eof()){
1106                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1107                 
1108                 in >> name;             m->gobble(in);          
1109                 in >> group;    m->gobble(in);
1110                 
1111                 if (name != "") {
1112                     //if this name is in the accnos file
1113                     names[name] = count;
1114                     count++;
1115                     out << name << '\t' << group << endl;
1116                 }
1117                 m->gobble(in);
1118             }
1119             in.close(); 
1120             out.close();
1121             
1122             m->mothurOut("\nUsing " + groupfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
1123         }
1124         
1125                 return 0;
1126         }
1127         catch(exception& e) {
1128                 m->errorOut(e, "SortSeqsCommand", "readGroup");
1129                 exit(1);
1130         }
1131 }
1132 //**********************************************************************************************************************
1133 int SortSeqsCommand::readTax(){
1134         try {
1135                 string thisOutputDir = outputDir;
1136                 if (outputDir == "") {  thisOutputDir += m->hasPath(taxfile);  }
1137                 map<string, string> variables; 
1138                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
1139         variables["[extension]"] = m->getExtension(taxfile);
1140                 string outputFileName = getOutputFileName("taxonomy", variables);
1141
1142         outputTypes["taxonomy"].push_back(outputFileName);  outputNames.push_back(outputFileName);
1143         
1144                 ofstream out;
1145                 m->openOutputFile(outputFileName, out);
1146         
1147                 ifstream in;
1148                 m->openInputFile(taxfile, in);
1149                 string name, tax;
1150                 
1151                 if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
1152             
1153             vector<string> seqs; seqs.resize(names.size(), "");
1154             
1155             while(!in.eof()){
1156                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1157                 
1158                 in >> name;             m->gobble(in);          
1159                 in >> tax;    m->gobble(in);
1160                 
1161                 if (name != "") {
1162                     map<string, int>::iterator it = names.find(name);
1163                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
1164                         seqs[it->second] = name + '\t' + tax;  
1165                     }else { //if we cant find it then add it to the end
1166                         names[name] = seqs.size();
1167                         seqs.push_back((name + '\t' + tax));
1168                         m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
1169                     }
1170                 }
1171             }
1172             in.close(); 
1173             
1174             int count = 0;
1175             for (int i = 0; i < seqs.size(); i++) {
1176                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
1177             }
1178             out.close();
1179             
1180             m->mothurOut("Ordered " + toString(count) + " sequences from " + taxfile + ".\n");
1181             
1182         }else { //read in file to fill names
1183             int count = 0;
1184             
1185             while(!in.eof()){
1186                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1187                 
1188                 in >> name;             m->gobble(in);          
1189                 in >> tax;    m->gobble(in);
1190                 
1191                 if (name != "") {
1192                     //if this name is in the accnos file
1193                     names[name] = count;
1194                     count++;
1195                     out << name << '\t' << tax << endl;
1196                 }
1197                 m->gobble(in);
1198             }
1199             in.close(); 
1200             out.close();
1201             
1202             m->mothurOut("\nUsing " + taxfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
1203         }
1204         
1205                 return 0;
1206                 return 0;
1207         }
1208         catch(exception& e) {
1209                 m->errorOut(e, "SortSeqsCommand", "readTax");
1210                 exit(1);
1211         }
1212 }
1213 //**********************************************************************************************************************
1214
1215
1216
1217
1218