]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
added count file to cluster.classic and cluster.split. modified splitting classes...
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24     countfile = count;
25         large = l;
26 }
27 /***********************************************************************/
28
29 SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){
30         m = MothurOut::getInstance();
31         fastafile = ffile;
32         namefile = name;
33     countfile = count;
34         taxFile = tax;
35         cutoff = c;  //tax level cutoff
36         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
37         method = t;
38         processors = p;
39     classic = cl;
40         outputDir = output;
41 }
42
43 /***********************************************************************/
44
45 int SplitMatrix::split(){
46         try {
47         
48                 if (method == "distance") {  
49                         splitDistance();
50                 }else if ((method == "classify") || (method == "fasta")) {
51                         splitClassify();
52                 }else {
53                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
54                         map<string, string> temp;
55                         if (namefile != "") {  temp[distFile] = namefile; }
56             else { temp[distFile] = countfile; }
57                         dists.push_back(temp);
58                 }
59                 
60                 return 0;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "SplitMatrix", "split");
64                 exit(1);
65         }
66 }
67 /***********************************************************************/
68 int SplitMatrix::splitDistance(){
69         try {
70         
71                 if (large)      { splitDistanceLarge(); }
72                 else            { splitDistanceRAM();   }
73                 
74                 return 0;
75                         
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "SplitMatrix", "splitDistance");
79                 exit(1);
80         }
81 }
82
83 /***********************************************************************/
84 int SplitMatrix::splitClassify(){
85         try {
86                 cutoff = int(cutoff);
87                                 
88                 map<string, int> seqGroup;
89                 map<string, int>::iterator it;
90                 map<string, int>::iterator it2;
91                 
92                 int numGroups = 0;
93                 
94                 //build tree from users taxonomy file
95                 PhyloTree* phylo = new PhyloTree();
96                 
97         map<string, string> temp;
98         m->readTax(taxFile, temp);
99         
100         for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
101             phylo->addSeqToTree(itTemp->first, itTemp->second);
102             temp.erase(itTemp++);
103         }
104                 
105                 phylo->assignHeirarchyIDs(0);
106
107                 //make sure the cutoff is not greater than maxlevel
108                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
109         
110                 //for each node in tree
111                 for (int i = 0; i < phylo->getNumNodes(); i++) {
112                 
113                         //is this node within the cutoff
114                         TaxNode taxon = phylo->get(i);
115         
116                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
117                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
118                                         for (int j = 0; j < taxon.accessions.size(); j++) {
119                                                 seqGroup[taxon.accessions[j]] = numGroups;
120                                         }
121                                         numGroups++;
122                                 }
123                         }
124                 }
125         
126                 delete phylo;
127                 
128                 if (method == "classify") {
129                         splitDistanceFileByTax(seqGroup, numGroups);
130                 }else {
131                         createDistanceFilesFromTax(seqGroup, numGroups);
132                 }
133                 
134                 return 0;
135                         
136         }
137         catch(exception& e) {
138                 m->errorOut(e, "SplitMatrix", "splitClassify");
139                 exit(1);
140         }
141 }
142 /***********************************************************************/
143 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
144         try {
145                 map<string, int> copyGroups = seqGroup;
146                 map<string, int>::iterator it;
147                 set<string> names;
148                                 
149                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
150                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
151                 }
152                         
153                 ifstream in;
154                 m->openInputFile(fastafile, in);
155         
156                 //parse fastafile
157                 ofstream outFile;
158                 while (!in.eof()) {
159                         Sequence query(in); m->gobble(in);
160                         if (query.getName() != "") {
161                 
162                                 it = seqGroup.find(query.getName());
163                                 
164                                 //save names in case no namefile is given
165                                 if ((namefile == "") && (countfile == "")) {  names.insert(query.getName()); }
166                         
167                                 if (it != seqGroup.end()) { //not singleton 
168                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
169                                         query.printSequence(outFile); 
170                                         outFile.close();
171                                         
172                                         copyGroups.erase(query.getName());
173                                 }
174                         }
175                 }
176                 in.close();
177                 
178                 //warn about sequence in groups that are not in fasta file
179                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
180                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
181                         exit(1);
182                 }
183                 
184                 copyGroups.clear();
185                 
186                 //process each distance file
187                 for (int i = 0; i < numGroups; i++) { 
188                         
189                         string options = "";
190             if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
191             else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
192                         if (outputDir != "") { options += ", outputdir=" + outputDir; }
193                         
194                         Command* command = new DistanceCommand(options);
195                         
196                         command->execute();
197                         delete command;
198                         
199                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
200                         
201                         //remove old names files just in case
202                         if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); }
203             else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); }
204                 }
205         
206         vector<string> tempDistFiles;    
207         for(int i=0;i<numGroups;i++){
208             if (outputDir == "") { outputDir = m->hasPath(fastafile); }
209             string tempDistFile = "";
210             if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
211             else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
212             tempDistFiles.push_back(tempDistFile);
213         }
214         
215         splitNames(seqGroup, numGroups, tempDistFiles);
216         
217                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
218                 
219                 return 0;
220         }
221         catch(exception& e) {
222                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
223                 exit(1);
224         }
225 }
226 /***********************************************************************/
227 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
228         try {
229                 map<string, int>::iterator it;
230                 map<string, int>::iterator it2;
231                 
232         ofstream outFile;
233                 ifstream dFile;
234                 m->openInputFile(distFile, dFile);
235                 
236                 
237                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
238                         m->mothurRemove((distFile + "." + toString(i) + ".temp"));
239                 }
240                 
241                 //for buffering the io to improve speed
242                  //allow for 10 dists to be stored, then output.
243                 vector<string> outputs;  outputs.resize(numGroups, "");
244                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
245                 
246                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
247                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
248                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
249                 
250                 //for each distance
251                 while(dFile){
252                         string seqA, seqB;
253                         float dist;
254                         
255                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp"));       } }
256                         
257                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
258                         
259                         //if both sequences are in the same group then they are within the cutoff
260                         it = seqGroup.find(seqA);
261                         it2 = seqGroup.find(seqB);
262                         
263                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
264                                 if (it->second == it2->second) { //they are from the same group so add the distance
265                                         if (numOutputs[it->second] > 30) {
266                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
267                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
268                                                 outFile.close();
269                                                 outputs[it->second] = "";
270                                                 numOutputs[it->second] = 0;
271                                                 validDistances[it->second] = true;
272                                         }else{
273                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
274                                                 numOutputs[it->second]++;
275                                         }
276                                 }
277                         }
278                 }
279                 dFile.close();
280         
281         string inputFile = namefile;
282         if (countfile != "") { inputFile = countfile; }
283         
284         vector<string> tempDistFiles;
285                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
286             string tempDistFile = distFile + "." + toString(i) + ".temp";
287             tempDistFiles.push_back(tempDistFile);
288                         m->mothurRemove((inputFile + "." + toString(i) + ".temp"));
289                         
290                         //write out any remaining buffers
291                         if (numOutputs[i] > 0) {
292                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
293                                 outFile << outputs[i];
294                                 outFile.close();
295                                 outputs[i] = "";
296                                 numOutputs[i] = 0;
297                                 validDistances[i] = true;
298                         }
299                 }
300                 
301         splitNames(seqGroup, numGroups, tempDistFiles);
302         
303                 if (m->control_pressed)  {  
304                         for (int i = 0; i < dists.size(); i++) { 
305                                 m->mothurRemove((dists[i].begin()->first));
306                                 m->mothurRemove((dists[i].begin()->second));
307                         }
308                         dists.clear();
309                 }
310                 
311                 return 0;
312         }
313         catch(exception& e) {
314                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
315                 exit(1);
316         }
317 }
318 /***********************************************************************/
319 int SplitMatrix::splitDistanceLarge(){
320         try {
321                 vector<set<string> > groups;
322                 
323                 //for buffering the io to improve speed
324                  //allow for 30 dists to be stored, then output.
325                 vector<string> outputs;
326                 vector<int> numOutputs;
327                 vector<bool> wroteOutPut;
328                 
329                 int numGroups = 0;
330
331                 ofstream outFile;
332                 ifstream dFile;
333                 m->openInputFile(distFile, dFile);
334         
335                 while(dFile){
336                         string seqA, seqB;
337                         float dist;
338
339                         dFile >> seqA >> seqB >> dist;
340                         
341                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
342                                         
343                         if(dist < cutoff){
344                                 //cout << "in cutoff: " << dist << endl;
345                                 int groupIDA = -1;
346                                 int groupIDB = -1;
347                                 int groupID = -1;
348                                 
349                                 for(int i=0;i<numGroups;i++){
350                                         set<string>::iterator aIt = groups[i].find(seqA);
351                                         set<string>::iterator bIt = groups[i].find(seqB);
352                                         
353                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
354                                                 groups[i].insert(seqB);
355                                                 groupIDA = i;
356                                                 groupID = groupIDA;
357
358                                                 //cout << "in aIt: " << groupID << endl;
359         //                                      break;
360                                         }
361                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
362                                                 groups[i].insert(seqA);
363                                                 groupIDB = i;
364                                                 groupID = groupIDB;
365
366                                         //      cout << "in bIt: " << groupID << endl;
367         //                                      break;
368                                         }
369                                 
370                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
371                                                 if(groupIDA < groupIDB){
372                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
373                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
374                                                         groups[groupIDB].clear(); 
375                                                         groupID = groupIDA;
376                                                 }
377                                                 else{
378                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
379                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
380                                                         groups[groupIDA].clear();  
381                                                         groupID = groupIDB;
382                                                 }
383                                                 break;
384                                         }
385                                 }
386                                 
387         //windows is gonna gag on the reuse of outFile, will need to make it local...
388                                 
389                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
390                                         set<string> newGroup;
391                                         newGroup.insert(seqA);
392                                         newGroup.insert(seqB);
393                                         groups.push_back(newGroup);
394                                                                         
395                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
396                                         outputs.push_back(tempOut);
397                                         numOutputs.push_back(1);
398                                         wroteOutPut.push_back(false);
399                                         
400                                         numGroups++;
401                                 }
402                                 else{
403                                         string fileName = distFile + "." + toString(groupID) + ".temp";
404                                                                                         
405                                         //have we reached the max buffer size
406                                         if (numOutputs[groupID] > 60) { //write out sequence
407                                                 outFile.open(fileName.c_str(), ios::app);
408                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
409                                                 outFile.close();
410                                                 
411                                                 outputs[groupID] = "";
412                                                 numOutputs[groupID] = 0;
413                                                 wroteOutPut[groupID] = true;
414                                         }else {
415                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
416                                                 numOutputs[groupID]++;
417                                         }
418                                         
419                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
420                                                 string row, column, distance;
421                                                 if(groupIDA<groupIDB){
422                                                         
423                                                         //merge memory
424                                                         numOutputs[groupID] += numOutputs[groupIDB];
425                                                         outputs[groupID] += outputs[groupIDB];
426                                                         
427                                                         outputs[groupIDB] = "";
428                                                         numOutputs[groupIDB] = 0;
429                                                         
430                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
431                                                         if (wroteOutPut[groupIDB]) {
432                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
433                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
434                                                                 
435                                                                 outFile.open(fileName.c_str(), ios::app);
436                                                                 
437                                                                 long size;
438                                                                 char* memblock;
439
440                                                                 size = fileB.tellg();
441                                 
442                                                                 fileB.seekg (0, ios::beg);
443                                                                 
444                                                                 int numRead = size / 1024;
445                                                                 int lastRead = size % 1024;
446
447                                                                 for (int i = 0; i < numRead; i++) {
448                                 
449                                                                         memblock = new char [1024];
450                                                                 
451                                                                         fileB.read (memblock, 1024);
452                                                                         
453                                                                         string temp = memblock;
454                                                                         outFile << temp.substr(0, 1024);
455                                                                         
456                                                                         delete memblock;
457                                                                 }
458                                                                 
459                                                                 memblock = new char [lastRead];
460                                                                 
461                                                                 fileB.read (memblock, lastRead);
462                                                                 
463                                                                 //not sure why but it will read more than lastRead char...??
464                                                                 string temp = memblock;
465                                                                 outFile << temp.substr(0, lastRead);
466                                                                 delete memblock;
467                                                                 
468                                                                 fileB.close();
469                                                                 m->mothurRemove(fileName2);
470                                                                 
471                                                                 //write out the merged memory
472                                                                 if (numOutputs[groupID] > 60) {
473                                                                         outFile << outputs[groupID];
474                                                                         outputs[groupID] = "";
475                                                                         numOutputs[groupID] = 0;
476                                                                 }
477                                                                 
478                                                                 outFile.close();
479                                                                 
480                                                                 wroteOutPut[groupID] = true;
481                                                                 wroteOutPut[groupIDB] = false;
482                                                         }else{ } //just merge b's memory with a's memory 
483                                                 }
484                                                 else{
485                                                         numOutputs[groupID] += numOutputs[groupIDA];
486                                                         outputs[groupID] += outputs[groupIDA];
487                                                         
488                                                         outputs[groupIDA] = "";
489                                                         numOutputs[groupIDA] = 0;
490                                                         
491                                                         if (wroteOutPut[groupIDA]) {
492                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
493                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
494                                                                 
495                                                                 outFile.open(fileName.c_str(), ios::app);
496                                                                 
497                                                                 long size;
498                                                                 char* memblock;
499
500                                                                 size = fileB.tellg();
501                                                                                                                         
502                                                                 fileB.seekg (0, ios::beg);
503                                                                 
504                                                                 int numRead = size / 1024;
505                                                                 int lastRead = size % 1024;
506
507                                                                 for (int i = 0; i < numRead; i++) {
508                                 
509                                                                         memblock = new char [1024];
510                                                                 
511                                                                         fileB.read (memblock, 1024);
512                                                                         string temp = memblock;
513                                                                         outFile << temp.substr(0, 1024);
514                                                                         
515                                                                         delete memblock;
516                                                                 }
517                                                                 
518                                                                 memblock = new char [lastRead];
519                                                                 
520                                                                 fileB.read (memblock, lastRead);
521                                                                 
522                                                                 //not sure why but it will read more than lastRead char...??
523                                                                 string temp = memblock;
524                                                                 outFile << temp.substr(0, lastRead);
525                                                                         
526                                                                 delete memblock;
527                                                                 
528                                                                 fileB.close();
529                                                                 m->mothurRemove(fileName2);
530                                                                 
531                                                                 //write out the merged memory
532                                                                 if (numOutputs[groupID] > 60) {
533                                                                         outFile << outputs[groupID];
534                                                                         outputs[groupID] = "";
535                                                                         numOutputs[groupID] = 0;
536                                                                 }
537                                                                 
538                                                                 outFile.close();
539                                                                 
540                                                                 wroteOutPut[groupID] = true;
541                                                                 wroteOutPut[groupIDA] = false;
542                                                         }else { } //just merge memory
543                                                 }                                       
544                                         }
545                                 }
546                         }
547                         m->gobble(dFile);
548                 }
549                 dFile.close();
550         
551                 vector<string> tempDistFiles;
552                 for (int i = 0; i < numGroups; i++) {
553             string fileName = distFile + "." + toString(i) + ".temp";
554             tempDistFiles.push_back(fileName);
555             //remove old names files just in case
556                         
557                         if (numOutputs[i] > 0) {
558                                 outFile.open(fileName.c_str(), ios::app);
559                                 outFile << outputs[i];
560                                 outFile.close();
561                         }
562                 }
563         
564         map<string, int> seqGroup;
565         for (int i = 0; i < groups.size(); i++) {
566             for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
567                 seqGroup[*itNames] = i;
568                 groups[i].erase(itNames++);
569             }
570         }
571         
572                 splitNames(seqGroup, numGroups, tempDistFiles);
573                                 
574                 return 0;                       
575         }
576         catch(exception& e) {
577                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
578                 exit(1);
579         }
580 }
581 //********************************************************************************************************************
582 int SplitMatrix::splitNames(map<string, int>& seqGroup, int numGroups, vector<string>& tempDistFiles){
583         try {
584         ofstream outFile;
585         map<string, int>::iterator it;
586         
587         string inputFile = namefile;
588         if (countfile != "") { inputFile = countfile; }
589         
590         for(int i=0;i<numGroups;i++){  m->mothurRemove((inputFile + "." + toString(i) + ".temp")); }
591
592         singleton = inputFile + ".extra.temp";
593         ofstream remainingNames;
594         m->openOutputFile(singleton, remainingNames);
595         
596         bool wroteExtra = false;
597         
598         ifstream bigNameFile;
599         m->openInputFile(inputFile, bigNameFile);
600         
601         //grab header line 
602         string headers = "";
603         if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); }
604         
605         string name, nameList;
606         while(!bigNameFile.eof()){
607             bigNameFile >> name >> nameList;  
608             m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given.
609             
610             //did this sequence get assigned a group
611             it = seqGroup.find(name);
612             
613             if (it != seqGroup.end()) {  
614                 m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile);
615                 outFile << name << '\t' << nameList << endl;
616                 outFile.close();
617             }else{
618                 wroteExtra = true;
619                 remainingNames << name << '\t' << nameList << endl;
620             }
621         }
622         bigNameFile.close();
623         
624                 for(int i=0;i<numGroups;i++){
625                         string tempNameFile = inputFile + "." + toString(i) + ".temp";
626                         string tempDistFile = tempDistFiles[i];
627             
628             //if there are valid distances
629             ifstream fileHandle;
630             fileHandle.open(tempDistFile.c_str());
631             if(fileHandle)      {       
632                 m->gobble(fileHandle);
633                 if (!fileHandle.eof()) {  //check
634                                 map<string, string> temp;
635                 if (countfile != "") {
636                     //add header
637                     ofstream out;
638                     string newtempNameFile = tempNameFile + "2";
639                     m->openOutputFile(newtempNameFile, out);
640                     out << headers << endl;
641                     out.close();
642                     m->appendFiles(tempNameFile, newtempNameFile);
643                     m->mothurRemove(tempNameFile);
644                     m->renameFile(newtempNameFile, tempNameFile);
645                 }
646                                 temp[tempDistFile] = tempNameFile;
647                                 dists.push_back(temp);
648                         }else{
649                                 ifstream in;
650                                 m->openInputFile(tempNameFile, in);
651                                 
652                                 while(!in.eof()) { 
653                                         in >> name >> nameList;  m->gobble(in);
654                                         wroteExtra = true;
655                                         remainingNames << name << '\t' << nameList << endl;
656                                 }
657                                 in.close();
658                                 m->mothurRemove(tempNameFile);
659                         }
660             }
661             fileHandle.close();
662                 }
663                 
664                 remainingNames.close();
665                 
666                 if (!wroteExtra) { 
667                         m->mothurRemove(singleton);
668                         singleton = "none";
669                 }else if (countfile != "") {
670             //add header
671             ofstream out;
672             string newtempNameFile = singleton + "2";
673             m->openOutputFile(newtempNameFile, out);
674             out << headers << endl;
675             out.close();
676             m->appendFiles(singleton, newtempNameFile);
677             m->mothurRemove(singleton);
678             m->renameFile(newtempNameFile, singleton);
679         }
680                 
681                 return 0;
682         }
683         catch(exception& e) {
684                 m->errorOut(e, "SplitMatrix", "splitNames");
685                 exit(1);
686         }
687 }
688 //********************************************************************************************************************
689 int SplitMatrix::splitDistanceRAM(){
690         try {
691                 vector<set<string> > groups;
692                 vector<string> outputs;
693                 
694                 int numGroups = 0;
695
696                 ifstream dFile;
697                 m->openInputFile(distFile, dFile);
698
699                 while(dFile){
700                         string seqA, seqB;
701                         float dist;
702
703                         dFile >> seqA >> seqB >> dist;
704                         
705                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
706                                         
707                         if(dist < cutoff){
708                                 //cout << "in cutoff: " << dist << endl;
709                                 int groupIDA = -1;
710                                 int groupIDB = -1;
711                                 int groupID = -1;
712                                 
713                                 for(int i=0;i<numGroups;i++){
714                                         set<string>::iterator aIt = groups[i].find(seqA);
715                                         set<string>::iterator bIt = groups[i].find(seqB);
716                                         
717                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
718                                                 groups[i].insert(seqB);
719                                                 groupIDA = i;
720                                                 groupID = groupIDA;
721
722                                                 //cout << "in aIt: " << groupID << endl;
723         //                                      break;
724                                         }
725                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
726                                                 groups[i].insert(seqA);
727                                                 groupIDB = i;
728                                                 groupID = groupIDB;
729
730                                         //      cout << "in bIt: " << groupID << endl;
731         //                                      break;
732                                         }
733                                 
734                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
735                                                 if(groupIDA < groupIDB){
736                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
737                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
738                                                         groups[groupIDB].clear(); 
739                                                         groupID = groupIDA;
740                                                 }
741                                                 else{
742                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
743                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
744                                                         groups[groupIDA].clear();  
745                                                         groupID = groupIDB;
746                                                 }
747                                                 break;
748                                         }
749                                 }
750                                 
751         //windows is gonna gag on the reuse of outFile, will need to make it local...
752                                 
753                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
754                                         set<string> newGroup;
755                                         newGroup.insert(seqA);
756                                         newGroup.insert(seqB);
757                                         groups.push_back(newGroup);
758                                                                         
759                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
760                                         outputs.push_back(tempOut);
761                                         numGroups++;
762                                 }
763                                 else{
764                                                                                         
765                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
766                                         
767                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
768                                                 string row, column, distance;
769                                                 if(groupIDA<groupIDB){
770                                                         //merge memory
771                                                         outputs[groupID] += outputs[groupIDB];
772                                                         outputs[groupIDB] = "";
773                                                 }else{
774                                                         outputs[groupID] += outputs[groupIDA];
775                                                         outputs[groupIDA] = "";
776                                                 }                                       
777                                         }
778                                 }
779                         }
780                         m->gobble(dFile);
781                 }
782                 dFile.close();
783                 
784         vector<string> tempDistFiles;
785                 for (int i = 0; i < numGroups; i++) {
786             string fileName = distFile + "." + toString(i) + ".temp";
787             tempDistFiles.push_back(fileName);
788                         if (outputs[i] != "") {
789                                 ofstream outFile;
790                                 outFile.open(fileName.c_str(), ios::ate);
791                                 outFile << outputs[i];
792                                 outFile.close();
793                         }
794                 }
795         
796         map<string, int> seqGroup;
797         for (int i = 0; i < groups.size(); i++) {
798             for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
799                 seqGroup[*itNames] = i;
800                 groups[i].erase(itNames++);
801             }
802         }
803         
804                 splitNames(seqGroup, numGroups, tempDistFiles);
805                                 
806                 return 0;                       
807         }
808         catch(exception& e) {
809                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
810                 exit(1);
811         }
812 }
813 //********************************************************************************************************************
814 //sorts biggest to smallest
815 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
816         
817         FILE * pFile;
818         long leftsize = 0;
819                 
820         //get num bytes in file
821         string filename = left.begin()->first;
822         pFile = fopen (filename.c_str(),"rb");
823         string error = "Error opening " + filename;
824         if (pFile==NULL) perror (error.c_str());
825         else{
826                 fseek (pFile, 0, SEEK_END);
827                 leftsize=ftell (pFile);
828                 fclose (pFile);
829         }
830
831         FILE * pFile2;
832         long rightsize = 0;
833                 
834         //get num bytes in file
835         filename = right.begin()->first;
836         pFile2 = fopen (filename.c_str(),"rb");
837         error = "Error opening " + filename;
838         if (pFile2==NULL) perror (error.c_str());
839         else{
840                 fseek (pFile2, 0, SEEK_END);
841                 rightsize=ftell (pFile2);
842                 fclose (pFile2);
843         }
844
845         return (leftsize > rightsize);  
846
847 /***********************************************************************/
848 //returns map of distance files -> namefile sorted by distance file size
849 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
850         try {   
851                 
852                 sort(dists.begin(), dists.end(), compareFileSizes);
853                 
854                 return dists;
855         }
856         catch(exception& e) {
857                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
858                 exit(1);
859         }
860 }
861 /***********************************************************************/
862 SplitMatrix::~SplitMatrix(){}
863 /***********************************************************************/
864