]> git.donarmstrong.com Git - mothur.git/blob - sortseqscommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / sortseqscommand.cpp
1 //
2 //  sortseqscommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 2/3/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "sortseqscommand.h"
10 #include "sequence.hpp"
11 #include "qualityscores.h"
12
13 //**********************************************************************************************************************
14 vector<string> SortSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
17         CommandParameter pflow("flow", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pflow);
18         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname);
19         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount);
20                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup);
21                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy);
22                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pqfile);
23                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
24                 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos);
25         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "SortSeqsCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string SortSeqsCommand::getHelpString(){        
39         try {
40                 string helpString = "";
41                 helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos fasta, name, group, count, taxonomy, flow or quality file.\n";
42         helpString += "The sort.seqs command parameters are accnos, fasta, name, group, count, taxonomy, flow, qfile and large.\n";
43         helpString += "The accnos file allows you to specify the order you want the files in.  If none is provided, mothur will use the order of the first file it reads.\n";
44         helpString += "The large parameters is used to indicate your files are too large to fit in RAM.\n";
45                 helpString += "The sort.seqs command should be in the following format: sort.seqs(fasta=yourFasta).\n";
46                 helpString += "Example sort.seqs(fasta=amazon.fasta).\n";
47                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
48                 return helpString;
49         }
50         catch(exception& e) {
51                 m->errorOut(e, "SortSeqsCommand", "getHelpString");
52                 exit(1);
53         }
54 }
55
56 //**********************************************************************************************************************
57 string SortSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ 
58         try {
59         string outputFileName = "";
60                 map<string, vector<string> >::iterator it;
61         
62         //is this a type this command creates
63         it = outputTypes.find(type);
64         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
65         else {
66             if (type == "fasta")            {   outputFileName =  "sorted" + m->getExtension(inputName);   }
67             else if (type == "taxonomy")    {   outputFileName =  "sorted" + m->getExtension(inputName);   }
68             else if (type == "name")        {   outputFileName =  "sorted" + m->getExtension(inputName);   }
69             else if (type == "count")       {   outputFileName =  "sorted" + m->getExtension(inputName);   }
70             else if (type == "group")       {   outputFileName =  "sorted" + m->getExtension(inputName);   }
71             else if (type == "flow")        {   outputFileName =  "sorted" + m->getExtension(inputName);   }
72             else if (type == "qfile")       {   outputFileName =  "sorted" + m->getExtension(inputName);   }
73             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
74         }
75         return outputFileName;
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "SortSeqsCommand", "getOutputFileNameTag");
79                 exit(1);
80         }
81 }
82
83 //**********************************************************************************************************************
84 SortSeqsCommand::SortSeqsCommand(){     
85         try {
86                 abort = true; calledHelp = true; 
87                 setParameters();
88                 vector<string> tempOutNames;
89                 outputTypes["fasta"] = tempOutNames;
90                 outputTypes["taxonomy"] = tempOutNames;
91                 outputTypes["name"] = tempOutNames;
92         outputTypes["count"] = tempOutNames;
93                 outputTypes["group"] = tempOutNames;
94                 outputTypes["qfile"] = tempOutNames;
95         outputTypes["flow"] = tempOutNames;
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
99                 exit(1);
100         }
101 }
102 //**********************************************************************************************************************
103 SortSeqsCommand::SortSeqsCommand(string option)  {
104         try {
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string,string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string,string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126                         vector<string> tempOutNames;
127                         outputTypes["fasta"] = tempOutNames;
128                         outputTypes["taxonomy"] = tempOutNames;
129                         outputTypes["name"] = tempOutNames;
130                         outputTypes["group"] = tempOutNames;
131                         outputTypes["qfile"] = tempOutNames;
132             outputTypes["flow"] = tempOutNames;
133             outputTypes["count"] = tempOutNames;
134                         
135                         //if the user changes the output directory command factory will send this info to us in the output parameter 
136                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
137                         
138                         //if the user changes the input directory command factory will send this info to us in the output parameter 
139                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
140                         if (inputDir == "not found"){   inputDir = "";          }
141                         else {
142                                 string path;
143                                 it = parameters.find("fasta");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
149                                 }
150                                 
151                                 it = parameters.find("name");
152                                 //user has given a template file
153                                 if(it != parameters.end()){ 
154                                         path = m->hasPath(it->second);
155                                         //if the user has not given a path then, add inputdir. else leave path alone.
156                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
157                                 }
158                                 
159                                 it = parameters.find("group");
160                                 //user has given a template file
161                                 if(it != parameters.end()){ 
162                                         path = m->hasPath(it->second);
163                                         //if the user has not given a path then, add inputdir. else leave path alone.
164                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
165                                 }
166                                 
167                                 it = parameters.find("taxonomy");
168                                 //user has given a template file
169                                 if(it != parameters.end()){ 
170                                         path = m->hasPath(it->second);
171                                         //if the user has not given a path then, add inputdir. else leave path alone.
172                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
173                                 }
174                                 
175                                 it = parameters.find("qfile");
176                                 //user has given a template file
177                                 if(it != parameters.end()){ 
178                                         path = m->hasPath(it->second);
179                                         //if the user has not given a path then, add inputdir. else leave path alone.
180                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
181                                 }
182                 
183                 it = parameters.find("accnos");
184                                 //user has given a template file
185                                 if(it != parameters.end()){ 
186                                         path = m->hasPath(it->second);
187                                         //if the user has not given a path then, add inputdir. else leave path alone.
188                                         if (path == "") {       parameters["accnos"] = inputDir + it->second;           }
189                                 }
190                 
191                 it = parameters.find("flow");
192                                 //user has given a template file
193                                 if(it != parameters.end()){ 
194                                         path = m->hasPath(it->second);
195                                         //if the user has not given a path then, add inputdir. else leave path alone.
196                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
197                                 }
198                 
199                 it = parameters.find("count");
200                                 //user has given a template file
201                                 if(it != parameters.end()){ 
202                                         path = m->hasPath(it->second);
203                                         //if the user has not given a path then, add inputdir. else leave path alone.
204                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
205                                 }
206                         }
207             
208                         
209                         //check for parameters
210             accnosfile = validParameter.validFile(parameters, "accnos", true);
211                         if (accnosfile == "not open") { accnosfile = ""; abort = true; }
212                         else if (accnosfile == "not found") {  accnosfile = "";  }      
213                         else { m->setAccnosFile(accnosfile); }
214             
215                         fastafile = validParameter.validFile(parameters, "fasta", true);
216                         if (fastafile == "not open") { fastafile = ""; abort = true; }
217                         else if (fastafile == "not found") {  fastafile = "";  }        
218                         else { m->setFastaFile(fastafile); }
219             
220             flowfile = validParameter.validFile(parameters, "flow", true);
221                         if (flowfile == "not open") { flowfile = ""; abort = true; }
222                         else if (flowfile == "not found") {  flowfile = "";  }  
223                         else { m->setFlowFile(flowfile); }
224             
225                         namefile = validParameter.validFile(parameters, "name", true);
226                         if (namefile == "not open") { namefile = ""; abort = true; }
227                         else if (namefile == "not found") {  namefile = "";  }  
228                         else { m->setNameFile(namefile); } 
229             
230                         groupfile = validParameter.validFile(parameters, "group", true);
231                         if (groupfile == "not open") { abort = true; }
232                         else if (groupfile == "not found") {  groupfile = "";  }
233                         else { m->setGroupFile(groupfile); }
234                         
235                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
236                         if (taxfile == "not open") { abort = true; }
237                         else if (taxfile == "not found") {  taxfile = "";  }
238                         else { m->setTaxonomyFile(taxfile); }
239                         
240                         qualfile = validParameter.validFile(parameters, "qfile", true);
241                         if (qualfile == "not open") { abort = true; }
242                         else if (qualfile == "not found") {  qualfile = "";  }                  
243                         else { m->setQualFile(qualfile); }
244             
245             countfile = validParameter.validFile(parameters, "count", true);
246                         if (countfile == "not open") { countfile = ""; abort = true; }
247                         else if (countfile == "not found") { countfile = "";  } 
248                         else { m->setCountTableFile(countfile); }
249             
250             if ((namefile != "") && (countfile != "")) {
251                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
252             }
253                         
254             if ((groupfile != "") && (countfile != "")) {
255                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
256             }
257                         
258             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") { temp = "f"; }
259                         large = m->isTrue(temp);
260             
261                         if ((fastafile == "") && (namefile == "") && (countfile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, group, count, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; }
262                         
263             if (countfile == "") {
264                 if ((fastafile != "") && (namefile == "")) {
265                     vector<string> files; files.push_back(fastafile);
266                     parser.getNameFile(files);
267                 }
268             }
269                 }
270         
271         }
272         catch(exception& e) {
273                 m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
274                 exit(1);
275         }
276 }
277 //**********************************************************************************************************************
278
279 int SortSeqsCommand::execute(){
280         try {
281                 
282                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
283                 
284                 //read through the correct file and output lines you want to keep
285         if (accnosfile != "")           {               
286             vector<string> temp;
287             m->readAccnos(accnosfile, temp);
288             for (int i = 0; i < temp.size(); i++) {  names[temp[i]] = i;  }
289             m->mothurOut("\nUsing " + accnosfile + " to determine the order. It contains " + toString(temp.size()) + " representative sequences.\n");   
290         }
291         
292                 if (fastafile != "")            {               readFasta();    }
293         if (flowfile != "")         {           readFlow();     }
294         if (qualfile != "")                     {               readQual();             }
295         if (namefile != "")                     {               readName();             }
296                 if (groupfile != "")            {               readGroup();    }
297         if (countfile != "")            {               readCount();    }
298         if (taxfile != "")                      {               readTax();              }
299                 
300                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
301         
302                 if (outputNames.size() != 0) {
303                         m->mothurOutEndLine();
304                         m->mothurOut("Output File Names: "); m->mothurOutEndLine();
305                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
306                         m->mothurOutEndLine();
307                         
308                         //set fasta file as new current fastafile
309                         string current = "";
310                         itTypes = outputTypes.find("fasta");
311                         if (itTypes != outputTypes.end()) {
312                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
313                         }
314                         
315                         itTypes = outputTypes.find("name");
316                         if (itTypes != outputTypes.end()) {
317                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
318                         }
319                         
320                         itTypes = outputTypes.find("group");
321                         if (itTypes != outputTypes.end()) {
322                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
323                         }
324                         
325                         
326                         itTypes = outputTypes.find("taxonomy");
327                         if (itTypes != outputTypes.end()) {
328                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
329                         }
330                         
331                         itTypes = outputTypes.find("qfile");
332                         if (itTypes != outputTypes.end()) {
333                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
334                         }       
335             
336             itTypes = outputTypes.find("flow");
337                         if (itTypes != outputTypes.end()) {
338                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFlowFile(current); }
339                         }
340             
341             itTypes = outputTypes.find("count");
342                         if (itTypes != outputTypes.end()) {
343                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
344                         }
345                 }
346                 
347                 return 0;               
348         }
349     
350         catch(exception& e) {
351                 m->errorOut(e, "SortSeqsCommand", "execute");
352                 exit(1);
353         }
354 }
355
356 //**********************************************************************************************************************
357 int SortSeqsCommand::readFasta(){
358         try {
359                 string thisOutputDir = outputDir;
360                 if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
361                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta", fastafile);
362                 outputTypes["fasta"].push_back(outputFileName);  outputNames.push_back(outputFileName);
363         
364                 ofstream out;
365                 m->openOutputFile(outputFileName, out);
366                 
367                 ifstream in;
368                 m->openInputFile(fastafile, in);
369                 string name;
370                 
371         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
372             
373             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
374                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
375                 //this way we only store 1000 seqs in memory at a time.
376                 
377                 int numNames = names.size();
378                 int numNamesInFile = 0;
379                 
380                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
381                 while(!in.eof()){
382                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
383                     
384                     Sequence currSeq(in);
385                     name = currSeq.getName();
386                     
387                     if (name != "") {
388                         numNamesInFile++;
389                         map<string, int>::iterator it = names.find(name);
390                         if (it == names.end()) { 
391                             names[name] = numNames; numNames++;
392                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
393                         }
394                     }
395                     m->gobble(in);
396                 }
397                 in.close();
398                 out.close();
399                 
400                 int numLeft = names.size();
401                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
402                 
403                 int size = 1000; //assume that user can hold 1000 seqs in memory
404                 if (numLeft < size) { size = numLeft; }
405                 int times = 0;
406                 
407                 vector<Sequence> seqs; seqs.resize(size);
408                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
409                 
410                 while (numLeft > 0) {
411                     
412                     ifstream in2;
413                     m->openInputFile(fastafile, in2);
414                     
415                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
416                     
417                     int found = 0;
418                     int needToFind = size;
419                     if (numLeft < size) { needToFind = numLeft; }
420                     
421                     while(!in2.eof()){
422                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
423                         
424                         //stop reading if we already found the seqs we are looking for
425                         if (found >= needToFind) { break; }
426                         
427                         Sequence currSeq(in2);
428                         name = currSeq.getName();
429                         
430                         if (name != "") {
431                             map<string, int>::iterator it = names.find(name);
432                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
433                                 //is it in the set of seqs we are looking for this time around
434                                 int thisSeqsPlace = it->second;
435                                 thisSeqsPlace -= (times * size);
436                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
437                                     seqs[thisSeqsPlace] = currSeq; 
438                                     found++;
439                                 }
440                             }else { m->mothurOut("[ERROR]: in logic of readFasta function.\n"); m->control_pressed = true; }
441                         }
442                         m->gobble(in2);
443                     }
444                     in2.close();        
445
446                     ofstream out2;
447                     m->openOutputFileAppend(outputFileName, out2);
448                     
449                     int output = seqs.size();
450                     if (numLeft < seqs.size()) { output = numLeft; }
451                         
452                     for (int i = 0; i < output; i++) {
453                         if (seqs[i].getName() != "") { seqs[i].printSequence(out2); }
454                     }
455                     out2.close();
456                     
457                     times++;
458                     numLeft -= output;
459                 }
460                 
461                 m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + fastafile + ".\n");
462             }else {
463                 
464                 vector<Sequence> seqs; seqs.resize(names.size());
465                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
466                 
467                 while(!in.eof()){
468                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
469                     
470                     Sequence currSeq(in);
471                     name = currSeq.getName();
472                     
473                     if (name != "") {
474                         map<string, int>::iterator it = names.find(name);
475                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
476                             seqs[it->second] = currSeq;  
477                         }else { //if we cant find it then add it to the end
478                             names[name] = seqs.size();
479                             seqs.push_back(currSeq);
480                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
481                         }
482                     }
483                     m->gobble(in);
484                 }
485                 in.close();     
486                 
487                 int count = 0;
488                 for (int i = 0; i < seqs.size(); i++) {
489                     if (seqs[i].getName() != "") {
490                         seqs[i].printSequence(out); count++;
491                     }
492                 }
493                 out.close();
494                 
495                 m->mothurOut("Ordered " + toString(count) + " sequences from " + fastafile + ".\n");
496             }
497                         
498         }else { //read in file to fill names
499             int count = 0;
500             
501             while(!in.eof()){
502                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
503                 
504                 Sequence currSeq(in);
505                 name = currSeq.getName();
506                 
507                 if (name != "") {
508                     //if this name is in the accnos file
509                     names[name] = count;
510                     count++;
511                     currSeq.printSequence(out);
512                 }
513                 m->gobble(in);
514             }
515             in.close(); 
516             out.close();
517             
518             m->mothurOut("\nUsing " + fastafile + " to determine the order. It contains " + toString(count) + " sequences.\n");
519         }
520                                 
521                 return 0;
522                 
523         }
524         catch(exception& e) {
525                 m->errorOut(e, "SortSeqsCommand", "readFasta");
526                 exit(1);
527         }
528 }
529 //**********************************************************************************************************************
530 int SortSeqsCommand::readFlow(){
531         try {
532                 string thisOutputDir = outputDir;
533                 if (outputDir == "") {  thisOutputDir += m->hasPath(flowfile);  }
534                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowfile)) + getOutputFileNameTag("flow", flowfile);
535                 outputTypes["flow"].push_back(outputFileName);  outputNames.push_back(outputFileName);
536         
537                 ofstream out;
538                 m->openOutputFile(outputFileName, out);
539                 
540                 ifstream in;
541                 m->openInputFile(flowfile, in);
542         int numFlows;
543                 string name;
544         
545         in >> numFlows; m->gobble(in);
546                 
547         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
548             
549             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
550                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
551                 //this way we only store 1000 seqs in memory at a time.
552                 
553                 int numNames = names.size();
554                 int numNamesInFile = 0;
555                 
556                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
557                 while(!in.eof()){
558                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
559                     
560                     in >> name; 
561                     string rest = m->getline(in);
562                     
563                     if (name != "") {
564                         numNamesInFile++;
565                         map<string, int>::iterator it = names.find(name);
566                         if (it == names.end()) { 
567                             names[name] = numNames; numNames++;
568                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
569                         }
570                     }
571                     m->gobble(in);
572                 }
573                 in.close();
574                 out.close();
575                 
576                 int numLeft = names.size();
577                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
578                 
579                 int size = 1000; //assume that user can hold 1000 seqs in memory
580                 if (numLeft < size) { size = numLeft; }
581                 int times = 0;
582                 
583                 vector<string> seqs; seqs.resize(size, "");
584                 
585                 while (numLeft > 0) {
586                     
587                     ifstream in2;
588                     m->openInputFile(flowfile, in2); in2 >> numFlows; m->gobble(in2);
589                     
590                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
591                     
592                     int found = 0;
593                     int needToFind = size;
594                     if (numLeft < size) { needToFind = numLeft; }
595                     
596                     while(!in2.eof()){
597                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
598                         
599                         //stop reading if we already found the seqs we are looking for
600                         if (found >= needToFind) { break; }
601                         
602                         in2 >> name;    
603                         string rest = m->getline(in2);
604                         
605                         if (name != "") {
606                             map<string, int>::iterator it = names.find(name);
607                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
608                                 //is it in the set of seqs we are looking for this time around
609                                 int thisSeqsPlace = it->second;
610                                 thisSeqsPlace -= (times * size);
611                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
612                                     seqs[thisSeqsPlace] = (name +'\t' + rest); 
613                                     found++;
614                                 }
615                             }else { m->mothurOut("[ERROR]: in logic of readFlow function.\n"); m->control_pressed = true; }
616                         }
617                         m->gobble(in2);
618                     }
619                     in2.close();        
620                     
621                     ofstream out2;
622                     m->openOutputFileAppend(outputFileName, out2);
623                     
624                     int output = seqs.size();
625                     if (numLeft < seqs.size()) { output = numLeft; }
626                     
627                     for (int i = 0; i < output; i++) {
628                         if (seqs[i] != "") {
629                             out2 << seqs[i] << endl;
630                         }
631                     }
632                     out2.close();
633                     
634                     times++;
635                     numLeft -= output;
636                 }
637                 
638                 m->mothurOut("Ordered " + toString(numNamesInFile) + " flows from " + flowfile + ".\n");
639             }else {
640                 
641                 vector<string> seqs; seqs.resize(names.size(), "");
642                 
643                 while(!in.eof()){
644                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
645                     
646                     in >> name; 
647                     string rest = m->getline(in);
648                     
649                     if (name != "") {
650                         map<string, int>::iterator it = names.find(name);
651                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
652                             seqs[it->second] = (name + '\t' + rest);  
653                         }else { //if we cant find it then add it to the end
654                             names[name] = seqs.size();
655                             seqs.push_back((name + '\t' + rest));
656                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
657                         }
658                     }
659                     m->gobble(in);
660                 }
661                 in.close();     
662                 
663                 int count = 0;
664                 for (int i = 0; i < seqs.size(); i++) {
665                     if (seqs[i] != "") {
666                         out << seqs[i] << endl;
667                         count++;
668                     }
669                 }
670                 out.close();
671                 
672                 m->mothurOut("Ordered " + toString(count) + " flows from " + flowfile + ".\n");
673             }
674             
675         }else { //read in file to fill names
676             int count = 0;
677             
678             while(!in.eof()){
679                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
680                 
681                 in >> name;     
682                 string rest = m->getline(in);
683                 
684                 if (name != "") {
685                     //if this name is in the accnos file
686                     names[name] = count;
687                     count++;
688                     out << name << '\t' << rest << endl;
689                 }
690                 m->gobble(in);
691             }
692             in.close(); 
693             out.close();
694             
695             m->mothurOut("\nUsing " + flowfile + " to determine the order. It contains " + toString(count) + " flows.\n");
696         }
697         
698                 return 0;
699                 
700         }
701         catch(exception& e) {
702                 m->errorOut(e, "SortSeqsCommand", "readFlow");
703                 exit(1);
704         }
705 }
706
707 //**********************************************************************************************************************
708 int SortSeqsCommand::readQual(){
709         try {
710                 string thisOutputDir = outputDir;
711                 if (outputDir == "") {  thisOutputDir += m->hasPath(qualfile);  }
712                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(qualfile)) + getOutputFileNameTag("qfile", qualfile);
713         outputTypes["qfile"].push_back(outputFileName);  outputNames.push_back(outputFileName);
714         
715                 ofstream out;
716                 m->openOutputFile(outputFileName, out);
717                 
718                 ifstream in;
719                 m->openInputFile(qualfile, in);
720                 string name;
721                 
722         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
723             
724             if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
725                 //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
726                 //this way we only store 1000 seqs in memory at a time.
727                 
728                 int numNames = names.size();
729                 int numNamesInFile = 0;
730                 
731                 //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
732                 while(!in.eof()){
733                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
734                     
735                     QualityScores currQual;
736                     currQual = QualityScores(in); 
737                     name = currQual.getName();
738                     
739                     if (name != "") {
740                         numNamesInFile++;
741                         map<string, int>::iterator it = names.find(name);
742                         if (it == names.end()) { 
743                             names[name] = numNames; numNames++;
744                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
745                         }
746                     }
747                     m->gobble(in);
748                 }
749                 in.close();
750                 out.close();
751                 
752                 int numLeft = names.size();
753                 if (numNamesInFile < numLeft) { numLeft = numNamesInFile; }
754                 
755                 int size = 1000; //assume that user can hold 1000 seqs in memory
756                 if (numLeft < size) { size = numLeft; }
757                 int times = 0;
758
759                 
760                 vector<QualityScores> seqs; seqs.resize(size);
761                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
762                 
763                 while (numLeft > 0) {
764                     
765                     ifstream in2;
766                     m->openInputFile(qualfile, in2);
767                     
768                     if (m->control_pressed) { in2.close();  m->mothurRemove(outputFileName);  return 0; }
769                     
770                     int found = 0;
771                     int needToFind = size;
772                     if (numLeft < size) { needToFind = numLeft; }
773                     
774                     while(!in2.eof()){
775                         if (m->control_pressed) { in2.close();   m->mothurRemove(outputFileName);  return 0; }
776                         
777                         //stop reading if we already found the seqs we are looking for
778                         if (found >= needToFind) { break; }
779                         
780                         QualityScores currQual;
781                         currQual = QualityScores(in2); 
782                         name = currQual.getName();
783                         
784                         if (name != "") {
785                             map<string, int>::iterator it = names.find(name);
786                             if (it != names.end()) { //we found it, so put it in the vector in the right place.
787                                 //is it in the set of seqs we are looking for this time around
788                                 int thisSeqsPlace = it->second;
789                                 thisSeqsPlace -= (times * size);
790                                 if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
791                                     seqs[thisSeqsPlace] = currQual; 
792                                     found++;
793                                 }
794                             }else { m->mothurOut("[ERROR]: in logic of readQual function.\n"); m->control_pressed = true; }
795                         }
796                         m->gobble(in2);
797                     }
798                     in2.close();        
799                     
800                     ofstream out2;
801                     m->openOutputFileAppend(outputFileName, out2);
802                     
803                     int output = seqs.size();
804                     if (numLeft < seqs.size()) { output = numLeft; }
805                     
806                     for (int i = 0; i < output; i++) {
807                         if (seqs[i].getName() != "") {
808                             seqs[i].printQScores(out2);
809                         }
810                     }
811                     out2.close();
812                     
813                     times++;
814                     numLeft -= output;
815                 }
816                 
817                  m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + qualfile + ".\n");
818                 
819             }else {
820                 
821                 vector<QualityScores> seqs; seqs.resize(names.size());
822                 for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
823                 
824                 while(!in.eof()){
825                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
826                     
827                     QualityScores currQual;
828                     currQual = QualityScores(in); 
829                     name = currQual.getName();
830                     
831                     if (name != "") {
832                         map<string, int>::iterator it = names.find(name);
833                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
834                             seqs[it->second] = currQual;  
835                         }else { //if we cant find it then add it to the end
836                             names[name] = seqs.size();
837                             seqs.push_back(currQual);
838                             m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
839                         }
840                     }
841                     m->gobble(in);
842                 }
843                 in.close();     
844                 
845                 int count = 0;
846                 for (int i = 0; i < seqs.size(); i++) {
847                     if (seqs[i].getName() != "") { seqs[i].printQScores(out); count++; }
848                 }
849                 out.close();
850                 
851                 m->mothurOut("Ordered " + toString(count) + " sequences from " + qualfile + ".\n");
852             }
853             
854         }else { //read in file to fill names
855             int count = 0;
856             
857             while(!in.eof()){
858                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
859                 
860                 QualityScores currQual;
861                 currQual = QualityScores(in);  
862                                
863                 m->gobble(in);
864                 
865                 if (currQual.getName() != "") {
866                     //if this name is in the accnos file
867                     names[currQual.getName()] = count;
868                     count++;
869                     currQual.printQScores(out);
870                 }
871                 m->gobble(in);
872             }
873             in.close(); 
874             out.close();
875             
876             m->mothurOut("\nUsing " + qualfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
877         }
878                 
879                 return 0;
880                 
881         }
882         catch(exception& e) {
883                 m->errorOut(e, "SortSeqsCommand", "readQual");
884                 exit(1);
885         }
886 }
887 //**********************************************************************************************************************
888 int SortSeqsCommand::readName(){
889         try {
890                 string thisOutputDir = outputDir;
891                 if (outputDir == "") {  thisOutputDir += m->hasPath(namefile);  }
892                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile); 
893         outputTypes["name"].push_back(outputFileName);  outputNames.push_back(outputFileName);
894         
895                 ofstream out;
896                 m->openOutputFile(outputFileName, out);
897         
898                 ifstream in;
899                 m->openInputFile(namefile, in);
900                 string name, firstCol, secondCol;
901                 
902         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
903         
904                 vector<string> seqs; seqs.resize(names.size(), "");
905                 
906                 while(!in.eof()){
907                     if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
908                     
909                     in >> firstCol;             m->gobble(in);          
910                     in >> secondCol;    m->gobble(in);
911                     
912                     if (firstCol != "") {
913                         map<string, int>::iterator it = names.find(firstCol);
914                         if (it != names.end()) { //we found it, so put it in the vector in the right place.
915                             seqs[it->second] = firstCol + '\t' + secondCol;  
916                         }else { //if we cant find it then add it to the end
917                             names[firstCol] = seqs.size();
918                             seqs.push_back((firstCol + '\t' + secondCol));
919                             m->mothurOut(firstCol + " was not in the contained the file which determined the order, adding it to the end.\n");
920                         }
921                     }
922                 }
923                 in.close();     
924                 
925                 int count = 0;
926                 for (int i = 0; i < seqs.size(); i++) {
927                     if (seqs[i] != "") { out << seqs[i] << endl; count++; }
928                 }
929                 out.close();
930                 
931                 m->mothurOut("Ordered " + toString(count) + " sequences from " + namefile + ".\n");
932             
933         }else { //read in file to fill names
934             int count = 0;
935             
936             while(!in.eof()){
937                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
938                 
939                 in >> firstCol;         m->gobble(in);          
940                 in >> secondCol;    m->gobble(in);
941                 
942                 if (firstCol != "") {
943                     //if this name is in the accnos file
944                     names[firstCol] = count;
945                     count++;
946                     out << firstCol << '\t' << secondCol << endl;
947                 }
948                 m->gobble(in);
949             }
950             in.close(); 
951             out.close();
952             
953             m->mothurOut("\nUsing " + namefile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
954         }
955                                 
956                 return 0;
957         }
958         catch(exception& e) {
959                 m->errorOut(e, "SortSeqsCommand", "readName");
960                 exit(1);
961         }
962 }
963 //**********************************************************************************************************************
964 int SortSeqsCommand::readCount(){
965         try {
966                 string thisOutputDir = outputDir;
967                 if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
968                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); 
969         outputTypes["count"].push_back(outputFileName);  outputNames.push_back(outputFileName);
970         
971                 ofstream out;
972                 m->openOutputFile(outputFileName, out);
973         
974                 ifstream in;
975                 m->openInputFile(countfile, in);
976                 string firstCol, rest;
977                 
978         if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
979             
980             vector<string> seqs; seqs.resize(names.size(), "");
981             
982             string headers = m->getline(in); m->gobble(in);
983             
984             while(!in.eof()){
985                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
986                 
987                 in >> firstCol;         m->gobble(in);          
988                 rest = m->getline(in);    m->gobble(in);
989                 
990                 if (firstCol != "") {
991                     map<string, int>::iterator it = names.find(firstCol);
992                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
993                         seqs[it->second] = firstCol + '\t' + rest;  
994                     }else { //if we cant find it then add it to the end
995                         names[firstCol] = seqs.size();
996                         seqs.push_back((firstCol + '\t' + rest));
997                         m->mothurOut(firstCol + " was not in the contained the file which determined the order, adding it to the end.\n");
998                     }
999                 }
1000             }
1001             in.close(); 
1002             
1003             int count = 0;
1004             out << headers << endl;
1005             for (int i = 0; i < seqs.size(); i++) {
1006                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
1007             }
1008             out.close();
1009             
1010             m->mothurOut("Ordered " + toString(count) + " sequences from " + countfile + ".\n");
1011             
1012         }else { //read in file to fill names
1013             int count = 0;
1014             
1015             string headers = m->getline(in); m->gobble(in);
1016             out << headers << endl;
1017             
1018             while(!in.eof()){
1019                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1020                 
1021                 in >> firstCol;         m->gobble(in);          
1022                 rest = m->getline(in);  m->gobble(in);
1023                 
1024                 if (firstCol != "") {
1025                     //if this name is in the accnos file
1026                     names[firstCol] = count;
1027                     count++;
1028                     out << firstCol << '\t' << rest << endl;
1029                 }
1030                 m->gobble(in);
1031             }
1032             in.close(); 
1033             out.close();
1034             
1035             m->mothurOut("\nUsing " + countfile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
1036         }
1037         
1038                 return 0;
1039         }
1040         catch(exception& e) {
1041                 m->errorOut(e, "SortSeqsCommand", "readCount");
1042                 exit(1);
1043         }
1044 }
1045 //**********************************************************************************************************************
1046 int SortSeqsCommand::readGroup(){
1047         try {
1048                 string thisOutputDir = outputDir;
1049                 if (outputDir == "") {  thisOutputDir += m->hasPath(groupfile);  }
1050                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); 
1051         outputTypes["group"].push_back(outputFileName);  outputNames.push_back(outputFileName);
1052         
1053                 ofstream out;
1054                 m->openOutputFile(outputFileName, out);
1055         
1056                 ifstream in;
1057                 m->openInputFile(groupfile, in);
1058                 string name, group;
1059                 
1060                 if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
1061             
1062             vector<string> seqs; seqs.resize(names.size(), "");
1063             
1064             while(!in.eof()){
1065                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1066                 
1067                 in >> name;             m->gobble(in);          
1068                 in >> group;    m->gobble(in);
1069                 
1070                 if (name != "") {
1071                     map<string, int>::iterator it = names.find(name);
1072                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
1073                         seqs[it->second] = name + '\t' + group;  
1074                     }else { //if we cant find it then add it to the end
1075                         names[name] = seqs.size();
1076                         seqs.push_back((name + '\t' + group));
1077                         m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
1078                     }
1079                 }
1080             }
1081             in.close(); 
1082             
1083             int count = 0;
1084             for (int i = 0; i < seqs.size(); i++) {
1085                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
1086             }
1087             out.close();
1088             
1089             m->mothurOut("Ordered " + toString(count) + " sequences from " + groupfile + ".\n");
1090             
1091         }else { //read in file to fill names
1092             int count = 0;
1093             
1094             while(!in.eof()){
1095                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1096                 
1097                 in >> name;             m->gobble(in);          
1098                 in >> group;    m->gobble(in);
1099                 
1100                 if (name != "") {
1101                     //if this name is in the accnos file
1102                     names[name] = count;
1103                     count++;
1104                     out << name << '\t' << group << endl;
1105                 }
1106                 m->gobble(in);
1107             }
1108             in.close(); 
1109             out.close();
1110             
1111             m->mothurOut("\nUsing " + groupfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
1112         }
1113         
1114                 return 0;
1115         }
1116         catch(exception& e) {
1117                 m->errorOut(e, "SortSeqsCommand", "readGroup");
1118                 exit(1);
1119         }
1120 }
1121 //**********************************************************************************************************************
1122 int SortSeqsCommand::readTax(){
1123         try {
1124                 string thisOutputDir = outputDir;
1125                 if (outputDir == "") {  thisOutputDir += m->hasPath(taxfile);  }
1126                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + getOutputFileNameTag("taxonomy", taxfile); 
1127         outputTypes["taxonomy"].push_back(outputFileName);  outputNames.push_back(outputFileName);
1128         
1129                 ofstream out;
1130                 m->openOutputFile(outputFileName, out);
1131         
1132                 ifstream in;
1133                 m->openInputFile(taxfile, in);
1134                 string name, tax;
1135                 
1136                 if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
1137             
1138             vector<string> seqs; seqs.resize(names.size(), "");
1139             
1140             while(!in.eof()){
1141                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1142                 
1143                 in >> name;             m->gobble(in);          
1144                 in >> tax;    m->gobble(in);
1145                 
1146                 if (name != "") {
1147                     map<string, int>::iterator it = names.find(name);
1148                     if (it != names.end()) { //we found it, so put it in the vector in the right place.
1149                         seqs[it->second] = name + '\t' + tax;  
1150                     }else { //if we cant find it then add it to the end
1151                         names[name] = seqs.size();
1152                         seqs.push_back((name + '\t' + tax));
1153                         m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
1154                     }
1155                 }
1156             }
1157             in.close(); 
1158             
1159             int count = 0;
1160             for (int i = 0; i < seqs.size(); i++) {
1161                 if (seqs[i] != "") { out << seqs[i] << endl; count++; }
1162             }
1163             out.close();
1164             
1165             m->mothurOut("Ordered " + toString(count) + " sequences from " + taxfile + ".\n");
1166             
1167         }else { //read in file to fill names
1168             int count = 0;
1169             
1170             while(!in.eof()){
1171                 if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
1172                 
1173                 in >> name;             m->gobble(in);          
1174                 in >> tax;    m->gobble(in);
1175                 
1176                 if (name != "") {
1177                     //if this name is in the accnos file
1178                     names[name] = count;
1179                     count++;
1180                     out << name << '\t' << tax << endl;
1181                 }
1182                 m->gobble(in);
1183             }
1184             in.close(); 
1185             out.close();
1186             
1187             m->mothurOut("\nUsing " + taxfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
1188         }
1189         
1190                 return 0;
1191                 return 0;
1192         }
1193         catch(exception& e) {
1194                 m->errorOut(e, "SortSeqsCommand", "readTax");
1195                 exit(1);
1196         }
1197 }
1198 //**********************************************************************************************************************
1199
1200
1201
1202
1203