]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
paralellized parsimony, unifrac.unweighted, phylo.diversity, indicator commands for...
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24     countfile = count;
25         large = l;
26 }
27 /***********************************************************************/
28
29 SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){
30         m = MothurOut::getInstance();
31         fastafile = ffile;
32         namefile = name;
33     countfile = count;
34         taxFile = tax;
35         cutoff = c;  //tax level cutoff
36         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
37         method = t;
38         processors = p;
39     classic = cl;
40         outputDir = output;
41 }
42
43 /***********************************************************************/
44
45 int SplitMatrix::split(){
46         try {
47         
48                 if (method == "distance") {  
49                         splitDistance();
50                 }else if ((method == "classify") || (method == "fasta")) {
51                         splitClassify();
52                 }else {
53                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
54                         map<string, string> temp;
55                         if (namefile != "") {  temp[distFile] = namefile; }
56             else { temp[distFile] = countfile; }
57                         dists.push_back(temp);
58                 }
59                 
60                 return 0;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "SplitMatrix", "split");
64                 exit(1);
65         }
66 }
67 /***********************************************************************/
68 int SplitMatrix::splitDistance(){
69         try {
70         
71                 if (large)      { splitDistanceLarge(); }
72                 else            { splitDistanceRAM();   }
73                 
74                 return 0;
75                         
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "SplitMatrix", "splitDistance");
79                 exit(1);
80         }
81 }
82
83 /***********************************************************************/
84 int SplitMatrix::splitClassify(){
85         try {
86                 cutoff = int(cutoff);
87                                 
88                 map<string, int> seqGroup;
89                 map<string, int>::iterator it;
90                 map<string, int>::iterator it2;
91                 
92                 int numGroups = 0;
93                 
94                 //build tree from users taxonomy file
95                 PhyloTree* phylo = new PhyloTree();
96                 
97         map<string, string> temp;
98         m->readTax(taxFile, temp);
99         
100         for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
101             phylo->addSeqToTree(itTemp->first, itTemp->second);
102             temp.erase(itTemp++);
103         }
104                 
105                 phylo->assignHeirarchyIDs(0);
106
107                 //make sure the cutoff is not greater than maxlevel
108                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
109         
110                 //for each node in tree
111                 for (int i = 0; i < phylo->getNumNodes(); i++) {
112                 
113                         //is this node within the cutoff
114                         TaxNode taxon = phylo->get(i);
115         
116                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
117                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
118                                         for (int j = 0; j < taxon.accessions.size(); j++) {
119                                                 seqGroup[taxon.accessions[j]] = numGroups;
120                                         }
121                                         numGroups++;
122                                 }
123                         }
124                 }
125         
126                 delete phylo;
127                 
128                 if (method == "classify") {
129                         splitDistanceFileByTax(seqGroup, numGroups);
130                 }else {
131                         createDistanceFilesFromTax(seqGroup, numGroups);
132                 }
133                 
134                 return 0;
135                         
136         }
137         catch(exception& e) {
138                 m->errorOut(e, "SplitMatrix", "splitClassify");
139                 exit(1);
140         }
141 }
142 /***********************************************************************/
143 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
144         try {
145                 map<string, int> copyGroups = seqGroup;
146                 map<string, int>::iterator it;
147                 set<string> names;
148                                 
149                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
150                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
151                 }
152                         
153                 ifstream in;
154                 m->openInputFile(fastafile, in);
155         
156                 //parse fastafile
157                 ofstream outFile;
158                 while (!in.eof()) {
159                         Sequence query(in); m->gobble(in);
160                         if (query.getName() != "") {
161                 
162                                 it = seqGroup.find(query.getName());
163                                 
164                                 //save names in case no namefile is given
165                                 if ((namefile == "") && (countfile == "")) {  names.insert(query.getName()); }
166                         
167                                 if (it != seqGroup.end()) { //not singleton 
168                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
169                                         query.printSequence(outFile); 
170                                         outFile.close();
171                                         
172                                         copyGroups.erase(query.getName());
173                                 }
174                         }
175                 }
176                 in.close();
177                 
178                 //warn about sequence in groups that are not in fasta file
179                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
180                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
181                         exit(1);
182                 }
183                 
184                 copyGroups.clear();
185                 
186                 //process each distance file
187                 for (int i = 0; i < numGroups; i++) { 
188                         
189                         string options = "";
190             if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
191             else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
192                         if (outputDir != "") { options += ", outputdir=" + outputDir; }
193                         
194             m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
195             
196                         Command* command = new DistanceCommand(options);
197                         
198             m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
199             
200                         command->execute();
201                         delete command;
202                         
203                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
204                         
205                         //remove old names files just in case
206                         if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); }
207             else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); }
208                 }
209         
210         vector<string> tempDistFiles;    
211         for(int i=0;i<numGroups;i++){
212             if (outputDir == "") { outputDir = m->hasPath(fastafile); }
213             string tempDistFile = "";
214             if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
215             else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
216             tempDistFiles.push_back(tempDistFile);
217         }
218         
219         splitNames(seqGroup, numGroups, tempDistFiles);
220         
221                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
222                 
223                 return 0;
224         }
225         catch(exception& e) {
226                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
227                 exit(1);
228         }
229 }
230 /***********************************************************************/
231 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
232         try {
233                 map<string, int>::iterator it;
234                 map<string, int>::iterator it2;
235                 
236         ofstream outFile;
237                 ifstream dFile;
238                 m->openInputFile(distFile, dFile);
239                 
240                 
241                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
242                         m->mothurRemove((distFile + "." + toString(i) + ".temp"));
243                 }
244                 
245                 //for buffering the io to improve speed
246                  //allow for 10 dists to be stored, then output.
247                 vector<string> outputs;  outputs.resize(numGroups, "");
248                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
249                 
250                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
251                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
252                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
253                 
254                 //for each distance
255                 while(dFile){
256                         string seqA, seqB;
257                         float dist;
258                         
259                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp"));       } }
260                         
261                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
262                         
263                         //if both sequences are in the same group then they are within the cutoff
264                         it = seqGroup.find(seqA);
265                         it2 = seqGroup.find(seqB);
266                         
267                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
268                                 if (it->second == it2->second) { //they are from the same group so add the distance
269                                         if (numOutputs[it->second] > 30) {
270                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
271                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
272                                                 outFile.close();
273                                                 outputs[it->second] = "";
274                                                 numOutputs[it->second] = 0;
275                                                 validDistances[it->second] = true;
276                                         }else{
277                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
278                                                 numOutputs[it->second]++;
279                                         }
280                                 }
281                         }
282                 }
283                 dFile.close();
284         
285         string inputFile = namefile;
286         if (countfile != "") { inputFile = countfile; }
287         
288         vector<string> tempDistFiles;
289                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
290             string tempDistFile = distFile + "." + toString(i) + ".temp";
291             tempDistFiles.push_back(tempDistFile);
292                         m->mothurRemove((inputFile + "." + toString(i) + ".temp"));
293                         
294                         //write out any remaining buffers
295                         if (numOutputs[i] > 0) {
296                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
297                                 outFile << outputs[i];
298                                 outFile.close();
299                                 outputs[i] = "";
300                                 numOutputs[i] = 0;
301                                 validDistances[i] = true;
302                         }
303                 }
304                 
305         splitNames(seqGroup, numGroups, tempDistFiles);
306         
307                 if (m->control_pressed)  {  
308                         for (int i = 0; i < dists.size(); i++) { 
309                                 m->mothurRemove((dists[i].begin()->first));
310                                 m->mothurRemove((dists[i].begin()->second));
311                         }
312                         dists.clear();
313                 }
314                 
315                 return 0;
316         }
317         catch(exception& e) {
318                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
319                 exit(1);
320         }
321 }
322 /***********************************************************************/
323 int SplitMatrix::splitDistanceLarge(){
324         try {
325                 vector<set<string> > groups;
326                 
327                 //for buffering the io to improve speed
328                  //allow for 30 dists to be stored, then output.
329                 vector<string> outputs;
330                 vector<int> numOutputs;
331                 vector<bool> wroteOutPut;
332                 
333                 int numGroups = 0;
334
335                 //ofstream outFile;
336                 ifstream dFile;
337                 m->openInputFile(distFile, dFile);
338         
339                 while(dFile){
340                         string seqA, seqB;
341                         float dist;
342
343                         dFile >> seqA >> seqB >> dist;
344                         
345                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
346                                         
347                         if(dist < cutoff){
348                                 //cout << "in cutoff: " << dist << endl;
349                                 int groupIDA = -1;
350                                 int groupIDB = -1;
351                                 int groupID = -1;
352                                 
353                                 for(int i=0;i<numGroups;i++){
354                                         set<string>::iterator aIt = groups[i].find(seqA);
355                                         set<string>::iterator bIt = groups[i].find(seqB);
356                                         
357                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
358                                                 groups[i].insert(seqB);
359                                                 groupIDA = i;
360                                                 groupID = groupIDA;
361
362                                                 //cout << "in aIt: " << groupID << endl;
363         //                                      break;
364                                         }
365                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
366                                                 groups[i].insert(seqA);
367                                                 groupIDB = i;
368                                                 groupID = groupIDB;
369
370                                         //      cout << "in bIt: " << groupID << endl;
371         //                                      break;
372                                         }
373                                 
374                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
375                                                 if(groupIDA < groupIDB){
376                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
377                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
378                                                         groups[groupIDB].clear(); 
379                                                         groupID = groupIDA;
380                                                 }
381                                                 else{
382                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
383                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
384                                                         groups[groupIDA].clear();  
385                                                         groupID = groupIDB;
386                                                 }
387                                                 break;
388                                         }
389                                 }
390                                 
391         //windows is gonna gag on the reuse of outFile, will need to make it local...
392                                 
393                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
394                                         set<string> newGroup;
395                                         newGroup.insert(seqA);
396                                         newGroup.insert(seqB);
397                                         groups.push_back(newGroup);
398                                                                         
399                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
400                                         outputs.push_back(tempOut);
401                                         numOutputs.push_back(1);
402                                         wroteOutPut.push_back(false);
403                                         
404                                         numGroups++;
405                                 }
406                                 else{
407                                         string fileName = distFile + "." + toString(groupID) + ".temp";
408                                                                                         
409                                         //have we reached the max buffer size
410                                         if (numOutputs[groupID] > 60) { //write out sequence
411                         ofstream outFile;
412                                                 outFile.open(fileName.c_str(), ios::app);
413                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
414                                                 outFile.close();
415                                                 
416                                                 outputs[groupID] = "";
417                                                 numOutputs[groupID] = 0;
418                                                 wroteOutPut[groupID] = true;
419                                         }else {
420                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
421                                                 numOutputs[groupID]++;
422                                         }
423                                         
424                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
425                                                 string row, column, distance;
426                                                 if(groupIDA<groupIDB){
427                                                         
428                                                         //merge memory
429                                                         numOutputs[groupID] += numOutputs[groupIDB];
430                                                         outputs[groupID] += outputs[groupIDB];
431                                                         
432                                                         outputs[groupIDB] = "";
433                                                         numOutputs[groupIDB] = 0;
434                                                         
435                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
436                                                         if (wroteOutPut[groupIDB]) {
437                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
438                                                                 /*ifstream fileB(fileName2.c_str(), ios::ate);
439                                                                 
440                                                                 outFile.open(fileName.c_str(), ios::app);
441                                                                 
442                                                                 long size;
443                                                                 char* memblock;
444
445                                                                 size = fileB.tellg();
446                                 
447                                                                 fileB.seekg (0, ios::beg);
448                                                                 
449                                                                 int numRead = size / 1024;
450                                                                 int lastRead = size % 1024;
451
452                                                                 for (int i = 0; i < numRead; i++) {
453                                 
454                                                                         memblock = new char [1024];
455                                                                 
456                                                                         fileB.read (memblock, 1024);
457                                                                         
458                                                                         string temp = memblock;
459                                                                         outFile << temp.substr(0, 1024);
460                                                                         
461                                                                         delete memblock;
462                                                                 }
463                                                                 
464                                                                 memblock = new char [lastRead];
465                                                                 
466                                                                 fileB.read (memblock, lastRead);
467                                                                 
468                                                                 //not sure why but it will read more than lastRead char...??
469                                                                 string temp = memblock;
470                                                                 outFile << temp.substr(0, lastRead);
471                                                                 delete memblock;
472                                                                 
473                                                                 fileB.close();*/
474                                 m->appendFiles(fileName2, fileName);
475                                                                 m->mothurRemove(fileName2);
476                         
477                                                                 
478                                                                 //write out the merged memory
479                                                                 if (numOutputs[groupID] > 60) {
480                                     ofstream tempOut;
481                                     m->openOutputFile(fileName, tempOut);
482                                                                         tempOut << outputs[groupID];
483                                                                         outputs[groupID] = "";
484                                                                         numOutputs[groupID] = 0;
485                                     tempOut.close();
486                                                                 }
487                                                                 
488                                                                 //outFile.close();
489                                                                 
490                                                                 wroteOutPut[groupID] = true;
491                                                                 wroteOutPut[groupIDB] = false;
492                                                         }else{ } //just merge b's memory with a's memory 
493                                                 }
494                                                 else{
495                                                         numOutputs[groupID] += numOutputs[groupIDA];
496                                                         outputs[groupID] += outputs[groupIDA];
497                                                         
498                                                         outputs[groupIDA] = "";
499                                                         numOutputs[groupIDA] = 0;
500                                                         
501                                                         if (wroteOutPut[groupIDA]) {
502                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
503                                                                 /*ifstream fileB(fileName2.c_str(), ios::ate);
504                                                                 
505                                                                 outFile.open(fileName.c_str(), ios::app);
506                                                                 
507                                                                 long size;
508                                                                 char* memblock;
509
510                                                                 size = fileB.tellg();
511                                                                                                                         
512                                                                 fileB.seekg (0, ios::beg);
513                                                                 
514                                                                 int numRead = size / 1024;
515                                                                 int lastRead = size % 1024;
516
517                                                                 for (int i = 0; i < numRead; i++) {
518                                 
519                                                                         memblock = new char [1024];
520                                                                 
521                                                                         fileB.read (memblock, 1024);
522                                                                         string temp = memblock;
523                                                                         outFile << temp.substr(0, 1024);
524                                                                         
525                                                                         delete memblock;
526                                                                 }
527                                                                 
528                                                                 memblock = new char [lastRead];
529                                                                 
530                                                                 fileB.read (memblock, lastRead);
531                                                                 
532                                                                 //not sure why but it will read more than lastRead char...??
533                                                                 string temp = memblock;
534                                                                 outFile << temp.substr(0, lastRead);
535                                                                         
536                                                                 delete memblock;
537                                                                 
538                                                                 fileB.close();*/
539                                 m->appendFiles(fileName2, fileName);
540                                                                 m->mothurRemove(fileName2);
541                                                                 
542                                                                 //write out the merged memory
543                                                                 if (numOutputs[groupID] > 60) {
544                                     ofstream tempOut;
545                                     m->openOutputFile(fileName, tempOut);
546                                                                         tempOut << outputs[groupID];
547                                                                         outputs[groupID] = "";
548                                                                         numOutputs[groupID] = 0;
549                                     tempOut.close();
550                                                                 }
551                                                                 
552                                                                 //outFile.close();
553                                                                 
554                                                                 wroteOutPut[groupID] = true;
555                                                                 wroteOutPut[groupIDA] = false;
556                                                         }else { } //just merge memory
557                                                 }                                       
558                                         }
559                                 }
560                         }
561                         m->gobble(dFile);
562                 }
563                 dFile.close();
564         
565                 vector<string> tempDistFiles;
566                 for (int i = 0; i < numGroups; i++) {
567             string fileName = distFile + "." + toString(i) + ".temp";
568             tempDistFiles.push_back(fileName);
569             //remove old names files just in case
570                         
571                         if (numOutputs[i] > 0) {
572                 ofstream outFile;
573                                 outFile.open(fileName.c_str(), ios::app);
574                                 outFile << outputs[i];
575                                 outFile.close();
576                         }
577                 }
578         
579         map<string, int> seqGroup;
580         for (int i = 0; i < groups.size(); i++) {
581             for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
582                 seqGroup[*itNames] = i;
583                 groups[i].erase(itNames++);
584             }
585         }
586         
587                 splitNames(seqGroup, numGroups, tempDistFiles);
588                                 
589                 return 0;                       
590         }
591         catch(exception& e) {
592                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
593                 exit(1);
594         }
595 }
596 //********************************************************************************************************************
597 int SplitMatrix::splitNames(map<string, int>& seqGroup, int numGroups, vector<string>& tempDistFiles){
598         try {
599         ofstream outFile;
600         map<string, int>::iterator it;
601         
602         string inputFile = namefile;
603         if (countfile != "") { inputFile = countfile; }
604         
605         for(int i=0;i<numGroups;i++){  m->mothurRemove((inputFile + "." + toString(i) + ".temp")); }
606
607         singleton = inputFile + ".extra.temp";
608         ofstream remainingNames;
609         m->openOutputFile(singleton, remainingNames);
610         
611         bool wroteExtra = false;
612         
613         ifstream bigNameFile;
614         m->openInputFile(inputFile, bigNameFile);
615         
616         //grab header line 
617         string headers = "";
618         if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); }
619         
620         string name, nameList;
621         while(!bigNameFile.eof()){
622             bigNameFile >> name >> nameList;  
623             m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given.
624             
625             //did this sequence get assigned a group
626             it = seqGroup.find(name);
627             
628             if (it != seqGroup.end()) {  
629                 m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile);
630                 outFile << name << '\t' << nameList << endl;
631                 outFile.close();
632             }else{
633                 wroteExtra = true;
634                 remainingNames << name << '\t' << nameList << endl;
635             }
636         }
637         bigNameFile.close();
638         
639                 for(int i=0;i<numGroups;i++){
640                         string tempNameFile = inputFile + "." + toString(i) + ".temp";
641                         string tempDistFile = tempDistFiles[i];
642             
643             //if there are valid distances
644             ifstream fileHandle;
645             fileHandle.open(tempDistFile.c_str());
646             if(fileHandle)      {       
647                 m->gobble(fileHandle);
648                 if (!fileHandle.eof()) {  //check
649                                 map<string, string> temp;
650                 if (countfile != "") {
651                     //add header
652                     ofstream out;
653                     string newtempNameFile = tempNameFile + "2";
654                     m->openOutputFile(newtempNameFile, out);
655                     out << "Representative_Sequence\ttotal" << endl;
656                     out.close();
657                     m->appendFiles(tempNameFile, newtempNameFile);
658                     m->mothurRemove(tempNameFile);
659                     m->renameFile(newtempNameFile, tempNameFile);
660                 }
661                                 temp[tempDistFile] = tempNameFile;
662                                 dists.push_back(temp);
663                         }else{
664                                 ifstream in;
665                                 m->openInputFile(tempNameFile, in);
666                                 
667                                 while(!in.eof()) { 
668                                         in >> name >> nameList;  m->gobble(in);
669                                         wroteExtra = true;
670                                         remainingNames << name << '\t' << nameList << endl;
671                                 }
672                                 in.close();
673                                 m->mothurRemove(tempNameFile);
674                         }
675             }
676             fileHandle.close();
677                 }
678                 
679                 remainingNames.close();
680                 
681                 if (!wroteExtra) { 
682                         m->mothurRemove(singleton);
683                         singleton = "none";
684                 }else if (countfile != "") {
685             //add header
686             ofstream out;
687             string newtempNameFile = singleton + "2";
688             m->openOutputFile(newtempNameFile, out);
689             out << "Representative_Sequence\ttotal" << endl; 
690             out.close();
691             m->appendFiles(singleton, newtempNameFile);
692             m->mothurRemove(singleton);
693             m->renameFile(newtempNameFile, singleton);
694         }
695                 
696                 return 0;
697         }
698         catch(exception& e) {
699                 m->errorOut(e, "SplitMatrix", "splitNames");
700                 exit(1);
701         }
702 }
703 //********************************************************************************************************************
704 int SplitMatrix::splitDistanceRAM(){
705         try {
706                 vector<set<string> > groups;
707                 vector<string> outputs;
708                 
709                 int numGroups = 0;
710
711                 ifstream dFile;
712                 m->openInputFile(distFile, dFile);
713
714                 while(dFile){
715                         string seqA, seqB;
716                         float dist;
717
718                         dFile >> seqA >> seqB >> dist;
719                         
720                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
721                                         
722                         if(dist < cutoff){
723                                 //cout << "in cutoff: " << dist << endl;
724                                 int groupIDA = -1;
725                                 int groupIDB = -1;
726                                 int groupID = -1;
727                                 
728                                 for(int i=0;i<numGroups;i++){
729                                         set<string>::iterator aIt = groups[i].find(seqA);
730                                         set<string>::iterator bIt = groups[i].find(seqB);
731                                         
732                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
733                                                 groups[i].insert(seqB);
734                                                 groupIDA = i;
735                                                 groupID = groupIDA;
736
737                                                 //cout << "in aIt: " << groupID << endl;
738         //                                      break;
739                                         }
740                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
741                                                 groups[i].insert(seqA);
742                                                 groupIDB = i;
743                                                 groupID = groupIDB;
744
745                                         //      cout << "in bIt: " << groupID << endl;
746         //                                      break;
747                                         }
748                                 
749                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
750                                                 if(groupIDA < groupIDB){
751                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
752                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
753                                                         groups[groupIDB].clear(); 
754                                                         groupID = groupIDA;
755                                                 }
756                                                 else{
757                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
758                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
759                                                         groups[groupIDA].clear();  
760                                                         groupID = groupIDB;
761                                                 }
762                                                 break;
763                                         }
764                                 }
765                                 
766         //windows is gonna gag on the reuse of outFile, will need to make it local...
767                                 
768                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
769                                         set<string> newGroup;
770                                         newGroup.insert(seqA);
771                                         newGroup.insert(seqB);
772                                         groups.push_back(newGroup);
773                                                                         
774                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
775                                         outputs.push_back(tempOut);
776                                         numGroups++;
777                                 }
778                                 else{
779                                                                                         
780                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
781                                         
782                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
783                                                 string row, column, distance;
784                                                 if(groupIDA<groupIDB){
785                                                         //merge memory
786                                                         outputs[groupID] += outputs[groupIDB];
787                                                         outputs[groupIDB] = "";
788                                                 }else{
789                                                         outputs[groupID] += outputs[groupIDA];
790                                                         outputs[groupIDA] = "";
791                                                 }                                       
792                                         }
793                                 }
794                         }
795                         m->gobble(dFile);
796                 }
797                 dFile.close();
798                 
799         vector<string> tempDistFiles;
800                 for (int i = 0; i < numGroups; i++) {
801             string fileName = distFile + "." + toString(i) + ".temp";
802             tempDistFiles.push_back(fileName);
803                         if (outputs[i] != "") {
804                                 ofstream outFile;
805                                 outFile.open(fileName.c_str(), ios::ate);
806                                 outFile << outputs[i];
807                                 outFile.close();
808                         }
809                 }
810         
811         map<string, int> seqGroup;
812         for (int i = 0; i < groups.size(); i++) {
813             for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
814                 seqGroup[*itNames] = i;
815                 groups[i].erase(itNames++);
816             }
817         }
818         
819                 splitNames(seqGroup, numGroups, tempDistFiles);
820                                 
821                 return 0;                       
822         }
823         catch(exception& e) {
824                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
825                 exit(1);
826         }
827 }
828 //********************************************************************************************************************
829 //sorts biggest to smallest
830 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
831         
832         FILE * pFile;
833         long leftsize = 0;
834                 
835         //get num bytes in file
836         string filename = left.begin()->first;
837         pFile = fopen (filename.c_str(),"rb");
838         string error = "Error opening " + filename;
839         if (pFile==NULL) perror (error.c_str());
840         else{
841                 fseek (pFile, 0, SEEK_END);
842                 leftsize=ftell (pFile);
843                 fclose (pFile);
844         }
845
846         FILE * pFile2;
847         long rightsize = 0;
848                 
849         //get num bytes in file
850         filename = right.begin()->first;
851         pFile2 = fopen (filename.c_str(),"rb");
852         error = "Error opening " + filename;
853         if (pFile2==NULL) perror (error.c_str());
854         else{
855                 fseek (pFile2, 0, SEEK_END);
856                 rightsize=ftell (pFile2);
857                 fclose (pFile2);
858         }
859
860         return (leftsize > rightsize);  
861
862 /***********************************************************************/
863 //returns map of distance files -> namefile sorted by distance file size
864 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
865         try {   
866                 
867                 sort(dists.begin(), dists.end(), compareFileSizes);
868                 
869                 return dists;
870         }
871         catch(exception& e) {
872                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
873                 exit(1);
874         }
875 }
876 /***********************************************************************/
877 SplitMatrix::~SplitMatrix(){}
878 /***********************************************************************/
879