]> git.donarmstrong.com Git - mothur.git/blob - sortseqscommand.cpp
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / sortseqscommand.cpp
1 //
2 //  sortseqscommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 2/3/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "sortseqscommand.h"
10 #include "sequence.hpp"
11 #include "qualityscores.h"
12
13 //**********************************************************************************************************************
14 vector<string> SortSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
17         CommandParameter pflow("flow", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pflow);
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup);
20                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy);
21                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pqfile);
22                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
23                 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos);
24         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "SortSeqsCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string SortSeqsCommand::getHelpString(){        
38         try {
39                 string helpString = "";
40                 helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos fasta, name, group, taxonomy, flow or quality file.\n";
41         helpString += "The sort.seqs command parameters are accnos, fasta, name, group, taxonomy, flow, qfile and large.\n";
42         helpString += "The accnos file allows you to specify the order you want the files in.  If none is provided, mothur will use the order of the first file it reads.\n";
43         helpString += "The large parameters is used to indicate your files are too large to fit in RAM.\n";
44                 helpString += "The sort.seqs command should be in the following format: sort.seqs(fasta=yourFasta).\n";
45                 helpString += "Example sort.seqs(fasta=amazon.fasta).\n";
46                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
47                 return helpString;
48         }
49         catch(exception& e) {
50                 m->errorOut(e, "SortSeqsCommand", "getHelpString");
51                 exit(1);
52         }
53 }
54
55 //**********************************************************************************************************************
56 string SortSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ 
57         try {
58         string outputFileName = "";
59                 map<string, vector<string> >::iterator it;
60         
61         //is this a type this command creates
62         it = outputTypes.find(type);
63         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
64         else {
65             if (type == "fasta")            {   outputFileName =  "sorted" + m->getExtension(inputName);   }
66             else if (type == "taxonomy")    {   outputFileName =  "sorted" + m->getExtension(inputName);   }
67             else if (type == "name")        {   outputFileName =  "sorted" + m->getExtension(inputName);   }
68             else if (type == "group")       {   outputFileName =  "sorted" + m->getExtension(inputName);   }
69             else if (type == "flow")        {   outputFileName =  "sorted" + m->getExtension(inputName);   }
70             else if (type == "qfile")       {   outputFileName =  "sorted" + m->getExtension(inputName);   }
71             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
72         }
73         return outputFileName;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "SortSeqsCommand", "getOutputFileNameTag");
77                 exit(1);
78         }
79 }
80
81 //**********************************************************************************************************************
82 SortSeqsCommand::SortSeqsCommand(){     
83         try {
84                 abort = true; calledHelp = true; 
85                 setParameters();
86                 vector<string> tempOutNames;
87                 outputTypes["fasta"] = tempOutNames;
88                 outputTypes["taxonomy"] = tempOutNames;
89                 outputTypes["name"] = tempOutNames;
90                 outputTypes["group"] = tempOutNames;
91                 outputTypes["qfile"] = tempOutNames;
92         outputTypes["flow"] = tempOutNames;
93         }
94         catch(exception& e) {
95                 m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
96                 exit(1);
97         }
98 }
99 //**********************************************************************************************************************
100 SortSeqsCommand::SortSeqsCommand(string option)  {
101         try {
102                 abort = false; calledHelp = false;   
103                 
104                 //allow user to run help
105                 if(option == "help") { help(); abort = true; calledHelp = true; }
106                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
107                 
108                 else {
109                         vector<string> myArray = setParameters();
110                         
111                         OptionParser parser(option);
112                         map<string,string> parameters = parser.getParameters();
113                         
114                         ValidParameters validParameter;
115                         map<string,string>::iterator it;
116                         
117                         //check to make sure all parameters are valid for command
118                         for (it = parameters.begin(); it != parameters.end(); it++) { 
119                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
120                         }
121                         
122                         //initialize outputTypes
123                         vector<string> tempOutNames;
124                         outputTypes["fasta"] = tempOutNames;
125                         outputTypes["taxonomy"] = tempOutNames;
126                         outputTypes["name"] = tempOutNames;
127                         outputTypes["group"] = tempOutNames;
128                         outputTypes["qfile"] = tempOutNames;
129             outputTypes["flow"] = tempOutNames;
130                         
131                         //if the user changes the output directory command factory will send this info to us in the output parameter 
132                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("fasta");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
145                                 }
146                                 
147                                 it = parameters.find("name");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
153                                 }
154                                 
155                                 it = parameters.find("group");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
161                                 }
162                                 
163                                 it = parameters.find("taxonomy");
164                                 //user has given a template file
165                                 if(it != parameters.end()){ 
166                                         path = m->hasPath(it->second);
167                                         //if the user has not given a path then, add inputdir. else leave path alone.
168                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
169                                 }
170                                 
171                                 it = parameters.find("qfile");
172                                 //user has given a template file
173                                 if(it != parameters.end()){ 
174                                         path = m->hasPath(it->second);
175                                         //if the user has not given a path then, add inputdir. else leave path alone.
176                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
177                                 }
178                 
179                 it = parameters.find("accnos");
180                                 //user has given a template file
181                                 if(it != parameters.end()){ 
182                                         path = m->hasPath(it->second);
183                                         //if the user has not given a path then, add inputdir. else leave path alone.
184                                         if (path == "") {       parameters["accnos"] = inputDir + it->second;           }
185                                 }
186                 
187                 it = parameters.find("flow");
188                                 //user has given a template file
189                                 if(it != parameters.end()){ 
190                                         path = m->hasPath(it->second);
191                                         //if the user has not given a path then, add inputdir. else leave path alone.
192                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
193                                 }
194                         }
195             
196                         
197                         //check for parameters
198             accnosfile = validParameter.validFile(parameters, "accnos", true);
199                         if (accnosfile == "not open") { accnosfile = ""; abort = true; }
200                         else if (accnosfile == "not found") {  accnosfile = "";  }      
201                         else { m->setAccnosFile(accnosfile); }
202             
203                         fastafile = validParameter.validFile(parameters, "fasta", true);
204                         if (fastafile == "not open") { fastafile = ""; abort = true; }
205                         else if (fastafile == "not found") {  fastafile = "";  }        
206                         else { m->setFastaFile(fastafile); }
207             
208             flowfile = validParameter.validFile(parameters, "flow", true);
209                         if (flowfile == "not open") { flowfile = ""; abort = true; }
210                         else if (flowfile == "not found") {  flowfile = "";  }  
211                         else { m->setFlowFile(flowfile); }
212             
213                         namefile = validParameter.validFile(parameters, "name", true);
214                         if (namefile == "not open") { namefile = ""; abort = true; }
215                         else if (namefile == "not found") {  namefile = "";  }  
216                         else { m->setNameFile(namefile); } 
217             
218                         groupfile = validParameter.validFile(parameters, "group", true);
219                         if (groupfile == "not open") { abort = true; }
220                         else if (groupfile == "not found") {  groupfile = "";  }
221                         else { m->setGroupFile(groupfile); }
222                         
223                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
224                         if (taxfile == "not open") { abort = true; }
225                         else if (taxfile == "not found") {  taxfile = "";  }
226                         else { m->setTaxonomyFile(taxfile); }
227                         
228                         qualfile = validParameter.validFile(parameters, "qfile", true);
229                         if (qualfile == "not open") { abort = true; }
230                         else if (qualfile == "not found") {  qualfile = "";  }                  
231                         else { m->setQualFile(qualfile); }
232                         
233             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") { temp = "f"; }
234                         large = m->isTrue(temp);
235             
236                         if ((fastafile == "") && (namefile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; }
237                         
238                         if ((fastafile != "") && (namefile == "")) {
239                                 vector<string> files; files.push_back(fastafile);
240                                 parser.getNameFile(files);
241                         }
242                 }
243         
244         }
245         catch(exception& e) {
246                 m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
247                 exit(1);
248         }
249 }
250 //**********************************************************************************************************************
251
252 int SortSeqsCommand::execute(){
253         try {
254                 
255                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
256                 
257                 //read through the correct file and output lines you want to keep
258         if (accnosfile != "")           {               
259             vector<string> temp;
260             m->readAccnos(accnosfile, temp);
261             for (int i = 0; i < temp.size(); i++) {  names[temp[i]] = i;  }
262             m->mothurOut("\nUsing " + accnosfile + " to determine the order. It contains " + toString(temp.size()) + " representative sequences.\n");   
263         }
264         
265                 if (fastafile != "")            {               readFasta();    }
266         if (flowfile != "")         {           readFlow();     }
267         if (qualfile != "")                     {               readQual();             }
268         if (namefile != "")                     {               readName();             }
269                 if (groupfile != "")            {               readGroup();    }
270         if (taxfile != "")                      {               readTax();              }
271                 
272                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
273         
274                 if (outputNames.size() != 0) {
275                         m->mothurOutEndLine();
276                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
277                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
278                         m->mothurOutEndLine();
279                         
280                         //set fasta file as new current fastafile
281                         string current = "";
282                         itTypes = outputTypes.find("fasta");
283                         if (itTypes != outputTypes.end()) {
284                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
285                         }
286                         
287                         itTypes = outputTypes.find("name");
288                         if (itTypes != outputTypes.end()) {
289                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
290                         }
291                         
292                         itTypes = outputTypes.find("group");
293                         if (itTypes != outputTypes.end()) {
294                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
295                         }
296                         
297                         
298                         itTypes = outputTypes.find("taxonomy");
299                         if (itTypes != outputTypes.end()) {
300                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
301                         }
302                         
303                         itTypes = outputTypes.find("qfile");
304                         if (itTypes != outputTypes.end()) {
305                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
306                         }       
307             
308             itTypes = outputTypes.find("flow");
309                         if (itTypes != outputTypes.end()) {
310                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFlowFile(current); }
311                         }       
312                 }
313                 
314                 return 0;               
315         }
316     
317         catch(exception& e) {
318                 m->errorOut(e, "SortSeqsCommand", "execute");
319                 exit(1);
320         }
321 }
322
323 //**********************************************************************************************************************
324 int SortSeqsCommand::readFasta(){
325         try {
326                 string thisOutputDir = outputDir;
327                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
328                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta", fastafile);
329                 outputTypes["fasta"].push_back(outputFileName);  outputNames.push_back(outputFileName);
330         
331                 ofstream out;
332                 m->openOutputFile(outputFileName, out);
333                 
334                 ifstream in;
335                 m->openInputFile(fastafile, in);
336                 string name;
337                 
338         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
339             
340             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
341                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
342                 //this way we only store 1000 seqs in memory at a time.
343                 
344                 int numNames = names.size();
345                 int numNamesInFile = 0;
346                 
347                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
348                 while(!in.eof()){
349                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
350                     
351                     Sequence currSeq(in);
352                     name = currSeq.getName();
353                     
354                     if (name != "") {
355                         numNamesInFile++;
356                         map<string, int>::iterator it = names.find(name);
357                         if (it == names.end()) { 
358                             names[name] = numNames; numNames++;
359                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
360                         }
361                     }
362                     m->gobble(in);
363                 }
364                 in.close();
365                 out.close();
366                 
367                 int numLeft = names.size();
368                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
369                 
370                 int size = 1000; //assume that user can hold 1000 seqs in memory
371                 if (numLeft < size) { size = numLeft; }
372                 int times = 0;
373                 
374                 vector<Sequence> seqs; seqs.resize(size);
375                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
376                 
377                 while (numLeft > 0) {
378                     
379                     ifstream in2;
380                     m->openInputFile(fastafile, in2);
381                     
382                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
383                     
384                     int found = 0;
385                     int needToFind = size;
386                     if (numLeft < size) { needToFind = numLeft; }
387                     
388                     while(!in2.eof()){
389                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
390                         
391                         //stop reading if we already found the seqs we are looking for
392                         if (found >= needToFind) { break; }
393                         
394                         Sequence currSeq(in2);
395                         name = currSeq.getName();
396                         
397                         if (name != "") {
398                             map<string, int>::iterator it = names.find(name);
399                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
400                                 //is it in the set of seqs we are looking for this time around
401                                 int thisSeqsPlace = it->second;
402                                 thisSeqsPlace -= (times * size);
403                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
404                                     seqs[thisSeqsPlace] = currSeq; 
405                                     found++;
406                                 }
407                             }else { m->mothurOut("[ERROR]: in logic of readFasta function.\n"); m->control_pressed = true; }
408                         }
409                         m->gobble(in2);
410                     }
411                     in2.close();        
412
413                     ofstream out2;
414                     m->openOutputFileAppend(outputFileName, out2);
415                     
416                     int output = seqs.size();
417                     if (numLeft < seqs.size()) { output = numLeft; }
418                         
419                     for (int i = 0; i < output; i++) {
420                         if (seqs[i].getName() != "") { seqs[i].printSequence(out2); }
421                     }
422                     out2.close();
423                     
424                     times++;
425                     numLeft -= output;
426                 }
427                 
428                 m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + fastafile + ".\n");
429             }else {
430                 
431                 vector<Sequence> seqs; seqs.resize(names.size());
432                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
433                 
434                 while(!in.eof()){
435                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
436                     
437                     Sequence currSeq(in);
438                     name = currSeq.getName();
439                     
440                     if (name != "") {
441                         map<string, int>::iterator it = names.find(name);
442                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
443                             seqs[it->second] = currSeq;  
444                         }else { //if we cant find it then add it to the end
445                             names[name] = seqs.size();
446                             seqs.push_back(currSeq);
447                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
448                         }
449                     }
450                     m->gobble(in);
451                 }
452                 in.close();     
453                 
454                 int count = 0;
455                 for (int i = 0; i < seqs.size(); i++) {
456                     if (seqs[i].getName() != "") {
457                         seqs[i].printSequence(out); count++;
458                     }
459                 }
460                 out.close();
461                 
462                 m->mothurOut("Ordered " + toString(count) + " sequences from " + fastafile + ".\n");
463             }
464                         
465         }else { //read in file to fill names
466             int count = 0;
467             
468             while(!in.eof()){
469                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
470                 
471                 Sequence currSeq(in);
472                 name = currSeq.getName();
473                 
474                 if (name != "") {
475                     //if this name is in the accnos file
476                     names[name] = count;
477                     count++;
478                     currSeq.printSequence(out);
479                 }
480                 m->gobble(in);
481             }
482             in.close(); 
483             out.close();
484             
485             m->mothurOut("\nUsing " + fastafile + " to determine the order. It contains " + toString(count) + " sequences.\n");
486         }
487                                 
488                 return 0;
489                 
490         }
491         catch(exception& e) {
492                 m->errorOut(e, "SortSeqsCommand", "readFasta");
493                 exit(1);
494         }
495 }
496 //**********************************************************************************************************************
497 int SortSeqsCommand::readFlow(){
498         try {
499                 string thisOutputDir = outputDir;
500                 if (outputDir == "") {  thisOutputDir += m->hasPath(flowfile);  }
501                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowfile)) + getOutputFileNameTag("flow", flowfile);
502                 outputTypes["flow"].push_back(outputFileName);  outputNames.push_back(outputFileName);
503         
504                 ofstream out;
505                 m->openOutputFile(outputFileName, out);
506                 
507                 ifstream in;
508                 m->openInputFile(flowfile, in);
509         int numFlows;
510                 string name;
511         
512         in >> numFlows; m->gobble(in);
513                 
514         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
515             
516             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
517                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
518                 //this way we only store 1000 seqs in memory at a time.
519                 
520                 int numNames = names.size();
521                 int numNamesInFile = 0;
522                 
523                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
524                 while(!in.eof()){
525                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
526                     
527                     in >> name; 
528                     string rest = m->getline(in);
529                     
530                     if (name != "") {
531                         numNamesInFile++;
532                         map<string, int>::iterator it = names.find(name);
533                         if (it == names.end()) { 
534                             names[name] = numNames; numNames++;
535                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
536                         }
537                     }
538                     m->gobble(in);
539                 }
540                 in.close();
541                 out.close();
542                 
543                 int numLeft = names.size();
544                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
545                 
546                 int size = 1000; //assume that user can hold 1000 seqs in memory
547                 if (numLeft < size) { size = numLeft; }
548                 int times = 0;
549                 
550                 vector<string> seqs; seqs.resize(size, "");
551                 
552                 while (numLeft > 0) {
553                     
554                     ifstream in2;
555                     m->openInputFile(flowfile, in2); in2 >> numFlows; m->gobble(in2);
556                     
557                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
558                     
559                     int found = 0;
560                     int needToFind = size;
561                     if (numLeft < size) { needToFind = numLeft; }
562                     
563                     while(!in2.eof()){
564                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
565                         
566                         //stop reading if we already found the seqs we are looking for
567                         if (found >= needToFind) { break; }
568                         
569                         in2 >> name;    
570                         string rest = m->getline(in2);
571                         
572                         if (name != "") {
573                             map<string, int>::iterator it = names.find(name);
574                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
575                                 //is it in the set of seqs we are looking for this time around
576                                 int thisSeqsPlace = it->second;
577                                 thisSeqsPlace -= (times * size);
578                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
579                                     seqs[thisSeqsPlace] = (name +'\t' + rest); 
580                                     found++;
581                                 }
582                             }else { m->mothurOut("[ERROR]: in logic of readFlow function.\n"); m->control_pressed = true; }
583                         }
584                         m->gobble(in2);
585                     }
586                     in2.close();        
587                     
588                     ofstream out2;
589                     m->openOutputFileAppend(outputFileName, out2);
590                     
591                     int output = seqs.size();
592                     if (numLeft < seqs.size()) { output = numLeft; }
593                     
594                     for (int i = 0; i < output; i++) {
595                         if (seqs[i] != "") {
596                             out2 << seqs[i] << endl;
597                         }
598                     }
599                     out2.close();
600                     
601                     times++;
602                     numLeft -= output;
603                 }
604                 
605                 m->mothurOut("Ordered " + toString(numNamesInFile) + " flows from " + flowfile + ".\n");
606             }else {
607                 
608                 vector<string> seqs; seqs.resize(names.size(), "");
609                 
610                 while(!in.eof()){
611                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
612                     
613                     in >> name; 
614                     string rest = m->getline(in);
615                     
616                     if (name != "") {
617                         map<string, int>::iterator it = names.find(name);
618                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
619                             seqs[it->second] = (name + '\t' + rest);  
620                         }else { //if we cant find it then add it to the end
621                             names[name] = seqs.size();
622                             seqs.push_back((name + '\t' + rest));
623                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
624                         }
625                     }
626                     m->gobble(in);
627                 }
628                 in.close();     
629                 
630                 int count = 0;
631                 for (int i = 0; i < seqs.size(); i++) {
632                     if (seqs[i] != "") {
633                         out << seqs[i] << endl;
634                         count++;
635                     }
636                 }
637                 out.close();
638                 
639                 m->mothurOut("Ordered " + toString(count) + " flows from " + flowfile + ".\n");
640             }
641             
642         }else { //read in file to fill names
643             int count = 0;
644             
645             while(!in.eof()){
646                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
647                 
648                 in >> name;     
649                 string rest = m->getline(in);
650                 
651                 if (name != "") {
652                     //if this name is in the accnos file
653                     names[name] = count;
654                     count++;
655                     out << name << '\t' << rest << endl;
656                 }
657                 m->gobble(in);
658             }
659             in.close(); 
660             out.close();
661             
662             m->mothurOut("\nUsing " + flowfile + " to determine the order. It contains " + toString(count) + " flows.\n");
663         }
664         
665                 return 0;
666                 
667         }
668         catch(exception& e) {
669                 m->errorOut(e, "SortSeqsCommand", "readFlow");
670                 exit(1);
671         }
672 }
673
674 //**********************************************************************************************************************
675 int SortSeqsCommand::readQual(){
676         try {
677                 string thisOutputDir = outputDir;
678                 if (outputDir == "") {  thisOutputDir += m->hasPath(qualfile);  }
679                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(qualfile)) + getOutputFileNameTag("qfile", qualfile);
680         outputTypes["qfile"].push_back(outputFileName);  outputNames.push_back(outputFileName);
681         
682                 ofstream out;
683                 m->openOutputFile(outputFileName, out);
684                 
685                 ifstream in;
686                 m->openInputFile(qualfile, in);
687                 string name;
688                 
689         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
690             
691             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
692                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
693                 //this way we only store 1000 seqs in memory at a time.
694                 
695                 int numNames = names.size();
696                 int numNamesInFile = 0;
697                 
698                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
699                 while(!in.eof()){
700                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
701                     
702                     QualityScores currQual;
703                     currQual = QualityScores(in); 
704                     name = currQual.getName();
705                     
706                     if (name != "") {
707                         numNamesInFile++;
708                         map<string, int>::iterator it = names.find(name);
709                         if (it == names.end()) { 
710                             names[name] = numNames; numNames++;
711                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
712                         }
713                     }
714                     m->gobble(in);
715                 }
716                 in.close();
717                 out.close();
718                 
719                 int numLeft = names.size();
720                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
721                 
722                 int size = 1000; //assume that user can hold 1000 seqs in memory
723                 if (numLeft < size) { size = numLeft; }
724                 int times = 0;
725
726                 
727                 vector<QualityScores> seqs; seqs.resize(size);
728                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
729                 
730                 while (numLeft > 0) {
731                     
732                     ifstream in2;
733                     m->openInputFile(qualfile, in2);
734                     
735                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
736                     
737                     int found = 0;
738                     int needToFind = size;
739                     if (numLeft < size) { needToFind = numLeft; }
740                     
741                     while(!in2.eof()){
742                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
743                         
744                         //stop reading if we already found the seqs we are looking for
745                         if (found >= needToFind) { break; }
746                         
747                         QualityScores currQual;
748                         currQual = QualityScores(in2); 
749                         name = currQual.getName();
750                         
751                         if (name != "") {
752                             map<string, int>::iterator it = names.find(name);
753                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
754                                 //is it in the set of seqs we are looking for this time around
755                                 int thisSeqsPlace = it->second;
756                                 thisSeqsPlace -= (times * size);
757                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
758                                     seqs[thisSeqsPlace] = currQual; 
759                                     found++;
760                                 }
761                             }else { m->mothurOut("[ERROR]: in logic of readQual function.\n"); m->control_pressed = true; }
762                         }
763                         m->gobble(in2);
764                     }
765                     in2.close();        
766                     
767                     ofstream out2;
768                     m->openOutputFileAppend(outputFileName, out2);
769                     
770                     int output = seqs.size();
771                     if (numLeft < seqs.size()) { output = numLeft; }
772                     
773                     for (int i = 0; i < output; i++) {
774                         if (seqs[i].getName() != "") {
775                             seqs[i].printQScores(out2);
776                         }
777                     }
778                     out2.close();
779                     
780                     times++;
781                     numLeft -= output;
782                 }
783                 
784                  m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + qualfile + ".\n");
785                 
786             }else {
787                 
788                 vector<QualityScores> seqs; seqs.resize(names.size());
789                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
790                 
791                 while(!in.eof()){
792                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
793                     
794                     QualityScores currQual;
795                     currQual = QualityScores(in); 
796                     name = currQual.getName();
797                     
798                     if (name != "") {
799                         map<string, int>::iterator it = names.find(name);
800                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
801                             seqs[it->second] = currQual;  
802                         }else { //if we cant find it then add it to the end
803                             names[name] = seqs.size();
804                             seqs.push_back(currQual);
805                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
806                         }
807                     }
808                     m->gobble(in);
809                 }
810                 in.close();     
811                 
812                 int count = 0;
813                 for (int i = 0; i < seqs.size(); i++) {
814                     if (seqs[i].getName() != "") { seqs[i].printQScores(out); count++; }
815                 }
816                 out.close();
817                 
818                 m->mothurOut("Ordered " + toString(count) + " sequences from " + qualfile + ".\n");
819             }
820             
821         }else { //read in file to fill names
822             int count = 0;
823             
824             while(!in.eof()){
825                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
826                 
827                 QualityScores currQual;
828                 currQual = QualityScores(in);  
829                                
830                 m->gobble(in);
831                 
832                 if (currQual.getName() != "") {
833                     //if this name is in the accnos file
834                     names[currQual.getName()] = count;
835                     count++;
836                     currQual.printQScores(out);
837                 }
838                 m->gobble(in);
839             }
840             in.close(); 
841             out.close();
842             
843             m->mothurOut("\nUsing " + qualfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
844         }
845                 
846                 return 0;
847                 
848         }
849         catch(exception& e) {
850                 m->errorOut(e, "SortSeqsCommand", "readQual");
851                 exit(1);
852         }
853 }
854 //**********************************************************************************************************************
855 int SortSeqsCommand::readName(){
856         try {
857                 string thisOutputDir = outputDir;
858                 if (outputDir == "") {  thisOutputDir += m->hasPath(namefile);  }
859                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile); 
860         outputTypes["name"].push_back(outputFileName);  outputNames.push_back(outputFileName);
861         
862                 ofstream out;
863                 m->openOutputFile(outputFileName, out);
864         
865                 ifstream in;
866                 m->openInputFile(namefile, in);
867                 string name, firstCol, secondCol;
868                 
869         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
870         
871                 vector<string> seqs; seqs.resize(names.size(), "");
872                 
873                 while(!in.eof()){
874                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
875                     
876                     in >> firstCol;             m->gobble(in);          
877                     in >> secondCol;    m->gobble(in);
878                     
879                     if (firstCol != "") {
880                         map<string, int>::iterator it = names.find(firstCol);
881                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
882                             seqs[it->second] = firstCol + '\t' + secondCol;  
883                         }else { //if we cant find it then add it to the end
884                             names[firstCol] = seqs.size();
885                             seqs.push_back((firstCol + '\t' + secondCol));
886                             m->mothurOut(firstCol + " was not in the contained the file which determined the order, adding it to the end.\n");
887                         }
888                     }
889                 }
890                 in.close();     
891                 
892                 int count = 0;
893                 for (int i = 0; i < seqs.size(); i++) {
894                     if (seqs[i] != "") { out << seqs[i] << endl; count++; }
895                 }
896                 out.close();
897                 
898                 m->mothurOut("Ordered " + toString(count) + " sequences from " + namefile + ".\n");
899             
900         }else { //read in file to fill names
901             int count = 0;
902             
903             while(!in.eof()){
904                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
905                 
906                 in >> firstCol;         m->gobble(in);          
907                 in >> secondCol;    m->gobble(in);
908                 
909                 if (firstCol != "") {
910                     //if this name is in the accnos file
911                     names[firstCol] = count;
912                     count++;
913                     out << firstCol << '\t' << secondCol << endl;
914                 }
915                 m->gobble(in);
916             }
917             in.close(); 
918             out.close();
919             
920             m->mothurOut("\nUsing " + namefile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
921         }
922                                 
923                 return 0;
924         }
925         catch(exception& e) {
926                 m->errorOut(e, "SortSeqsCommand", "readName");
927                 exit(1);
928         }
929 }
930
931 //**********************************************************************************************************************
932 int SortSeqsCommand::readGroup(){
933         try {
934                 string thisOutputDir = outputDir;
935                 if (outputDir == "") {  thisOutputDir += m->hasPath(groupfile);  }
936                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); 
937         outputTypes["group"].push_back(outputFileName);  outputNames.push_back(outputFileName);
938         
939                 ofstream out;
940                 m->openOutputFile(outputFileName, out);
941         
942                 ifstream in;
943                 m->openInputFile(groupfile, in);
944                 string name, group;
945                 
946                 if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
947             
948             vector<string> seqs; seqs.resize(names.size(), "");
949             
950             while(!in.eof()){
951                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
952                 
953                 in >> name;             m->gobble(in);          
954                 in >> group;    m->gobble(in);
955                 
956                 if (name != "") {
957                     map<string, int>::iterator it = names.find(name);
958                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
959                         seqs[it->second] = name + '\t' + group;  
960                     }else { //if we cant find it then add it to the end
961                         names[name] = seqs.size();
962                         seqs.push_back((name + '\t' + group));
963                         m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
964                     }
965                 }
966             }
967             in.close(); 
968             
969             int count = 0;
970             for (int i = 0; i < seqs.size(); i++) {
971                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
972             }
973             out.close();
974             
975             m->mothurOut("Ordered " + toString(count) + " sequences from " + groupfile + ".\n");
976             
977         }else { //read in file to fill names
978             int count = 0;
979             
980             while(!in.eof()){
981                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
982                 
983                 in >> name;             m->gobble(in);          
984                 in >> group;    m->gobble(in);
985                 
986                 if (name != "") {
987                     //if this name is in the accnos file
988                     names[name] = count;
989                     count++;
990                     out << name << '\t' << group << endl;
991                 }
992                 m->gobble(in);
993             }
994             in.close(); 
995             out.close();
996             
997             m->mothurOut("\nUsing " + groupfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
998         }
999         
1000                 return 0;
1001         }
1002         catch(exception& e) {
1003                 m->errorOut(e, "SortSeqsCommand", "readGroup");
1004                 exit(1);
1005         }
1006 }
1007 //**********************************************************************************************************************
1008 int SortSeqsCommand::readTax(){
1009         try {
1010                 string thisOutputDir = outputDir;
1011                 if (outputDir == "") {  thisOutputDir += m->hasPath(taxfile);  }
1012                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + getOutputFileNameTag("taxonomy", taxfile); 
1013         outputTypes["taxonomy"].push_back(outputFileName);  outputNames.push_back(outputFileName);
1014         
1015                 ofstream out;
1016                 m->openOutputFile(outputFileName, out);
1017         
1018                 ifstream in;
1019                 m->openInputFile(taxfile, in);
1020                 string name, tax;
1021                 
1022                 if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
1023             
1024             vector<string> seqs; seqs.resize(names.size(), "");
1025             
1026             while(!in.eof()){
1027                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1028                 
1029                 in >> name;             m->gobble(in);          
1030                 in >> tax;    m->gobble(in);
1031                 
1032                 if (name != "") {
1033                     map<string, int>::iterator it = names.find(name);
1034                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
1035                         seqs[it->second] = name + '\t' + tax;  
1036                     }else { //if we cant find it then add it to the end
1037                         names[name] = seqs.size();
1038                         seqs.push_back((name + '\t' + tax));
1039                         m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
1040                     }
1041                 }
1042             }
1043             in.close(); 
1044             
1045             int count = 0;
1046             for (int i = 0; i < seqs.size(); i++) {
1047                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
1048             }
1049             out.close();
1050             
1051             m->mothurOut("Ordered " + toString(count) + " sequences from " + taxfile + ".\n");
1052             
1053         }else { //read in file to fill names
1054             int count = 0;
1055             
1056             while(!in.eof()){
1057                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1058                 
1059                 in >> name;             m->gobble(in);          
1060                 in >> tax;    m->gobble(in);
1061                 
1062                 if (name != "") {
1063                     //if this name is in the accnos file
1064                     names[name] = count;
1065                     count++;
1066                     out << name << '\t' << tax << endl;
1067                 }
1068                 m->gobble(in);
1069             }
1070             in.close(); 
1071             out.close();
1072             
1073             m->mothurOut("\nUsing " + taxfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
1074         }
1075         
1076                 return 0;
1077                 return 0;
1078         }
1079         catch(exception& e) {
1080                 m->errorOut(e, "SortSeqsCommand", "readTax");
1081                 exit(1);
1082         }
1083 }
1084 //**********************************************************************************************************************
1085
1086
1087
1088
1089