]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
fixed bug in cluster.split with classify method
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "sequencedb.h"
13 #include "onegapdist.h"
14 #include "dist.h"
15
16 /***********************************************************************/
17
18 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
19         m = MothurOut::getInstance();
20         distFile = distfile;
21         cutoff = c;
22         namefile = name;
23         method = t;
24         taxFile = tax;
25         large = l;
26 }
27 /***********************************************************************/
28
29 SplitMatrix::SplitMatrix(string ffile, string tax, float c, string t){
30         m = MothurOut::getInstance();
31         fastafile = ffile;
32         taxFile = tax;
33         cutoff = c;
34         method = t;
35 }
36
37 /***********************************************************************/
38
39 int SplitMatrix::split(){
40         try {
41         
42                 if (method == "distance") {  
43                         splitDistance();
44                 }else if ((method == "classify") || (method == "fasta")) {
45                         splitClassify();
46                 }else {
47                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
48                         map<string, string> temp;
49                         temp[distFile] = namefile;
50                         dists.push_back(temp);
51                 }
52                 
53                 return 0;
54         }
55         catch(exception& e) {
56                 m->errorOut(e, "SplitMatrix", "split");
57                 exit(1);
58         }
59 }
60 /***********************************************************************/
61 int SplitMatrix::splitDistance(){
62         try {
63         
64                 if (large)      { splitDistanceLarge(); }
65                 else            { splitDistanceRAM();   }
66                         
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "SplitMatrix", "splitDistance");
70                 exit(1);
71         }
72 }
73
74 /***********************************************************************/
75 int SplitMatrix::splitClassify(){
76         try {
77                 cutoff = int(cutoff);
78                 
79                 map<string, int> seqGroup;
80                 map<string, int>::iterator it;
81                 map<string, int>::iterator it2;
82                 
83                 int numGroups = 0;
84                 
85                 //build tree from users taxonomy file
86                 PhyloTree* phylo = new PhyloTree();
87                 
88                 ifstream in;
89                 openInputFile(taxFile, in);
90                         
91                 //read in users taxonomy file and add sequences to tree
92                 string seqname, tax;
93                 while(!in.eof()){
94                         in >> seqname >> tax; gobble(in);
95                         phylo->addSeqToTree(seqname, tax);
96                 }
97                 in.close();
98                 
99                 phylo->assignHeirarchyIDs(0);
100
101                 //make sure the cutoff is not greater than maxlevel
102                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
103         
104                 //for each node in tree
105                 for (int i = 0; i < phylo->getNumNodes(); i++) {
106                 
107                         //is this node within the cutoff
108                         TaxNode taxon = phylo->get(i);
109         
110                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
111                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
112                                         for (int j = 0; j < taxon.accessions.size(); j++) {
113                                                 seqGroup[taxon.accessions[j]] = numGroups;
114                                         }
115                                         numGroups++;
116                                 }
117                         }
118                 }
119         
120                 delete phylo;
121                 
122                 if (method == "classify") {
123                         splitDistanceFileByTax(seqGroup, numGroups);
124                 }else {
125                         createDistanceFilesFromTax(seqGroup, numGroups);
126                 }
127                 
128                                 
129                 return 0;
130                         
131         }
132         catch(exception& e) {
133                 m->errorOut(e, "SplitMatrix", "splitClassify");
134                 exit(1);
135         }
136 }
137 /***********************************************************************/
138 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
139         try {
140                 map<string, int>::iterator it;
141                 map<string, int>::iterator it2;
142                 map<string, int> seqIndexInFasta;
143                                 
144                 //read fastafile
145                 SequenceDB alignDB;
146                 
147                 ifstream filehandle;
148                 openInputFile(fastafile, filehandle);
149                 int numSeqs = 0;
150                 while (!filehandle.eof()) {
151                         //input sequence info into sequencedb
152                         Sequence newSequence(filehandle);
153                         
154                         if (newSequence.getName() != "") {   
155                                 alignDB.push_back(newSequence);  
156                                 seqIndexInFasta[newSequence.getName()] = numSeqs;
157                                 numSeqs++;
158                         }
159                         
160                         //takes care of white space
161                         gobble(filehandle);
162                 }
163                 filehandle.close();
164                 
165                 Dist* distCalculator = new oneGapDist();
166                 
167                 
168 //still not done....            
169                 
170                 
171                 return 0;
172         }
173         catch(exception& e) {
174                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
175                 exit(1);
176         }
177 }
178 /***********************************************************************/
179 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
180         try {
181                 map<string, int>::iterator it;
182                 map<string, int>::iterator it2;
183                 
184                 ifstream dFile;
185                 openInputFile(distFile, dFile);
186                 ofstream outFile;
187                 
188                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
189                         remove((distFile + "." + toString(i) + ".temp").c_str());
190                 }
191                 
192                 //for buffering the io to improve speed
193                  //allow for 10 dists to be stored, then output.
194                 vector<string> outputs;  outputs.resize(numGroups, "");
195                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
196                 
197                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
198                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
199                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
200                 
201                 //for each distance
202                 while(dFile){
203                         string seqA, seqB;
204                         float dist;
205                         
206                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
207                         
208                         dFile >> seqA >> seqB >> dist;  gobble(dFile);
209                         
210                         //if both sequences are in the same group then they are within the cutoff
211                         it = seqGroup.find(seqA);
212                         it2 = seqGroup.find(seqB);
213                         
214                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
215                                 if (it->second == it2->second) { //they are from the same group so add the distance
216                                         if (numOutputs[it->second] > 30) {
217                                                 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
218                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
219                                                 outFile.close();
220                                                 outputs[it->second] = "";
221                                                 numOutputs[it->second] = 0;
222                                                 validDistances[it->second] = true;
223                                         }else{
224                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
225                                                 numOutputs[it->second]++;
226                                         }
227                                 }
228                         }
229                 }
230                 dFile.close();
231         
232                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
233                         remove((namefile + "." + toString(i) + ".temp").c_str());
234                         
235                         //write out any remaining buffers
236                         if (numOutputs[i] > 0) {
237                                 openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
238                                 outFile << outputs[i];
239                                 outFile.close();
240                                 outputs[i] = "";
241                                 numOutputs[i] = 0;
242                                 validDistances[i] = true;
243                         }
244                 }
245                 
246                 ifstream bigNameFile;
247                 openInputFile(namefile, bigNameFile);
248                 
249                 singleton = namefile + ".extra.temp";
250                 ofstream remainingNames;
251                 openOutputFile(singleton, remainingNames);
252                 
253                 bool wroteExtra = false;
254                                                 
255                 string name, nameList;
256                 while(!bigNameFile.eof()){
257                         bigNameFile >> name >> nameList;  gobble(bigNameFile);
258                         
259                         //did this sequence get assigned a group
260                         it = seqGroup.find(name);
261                         
262                         if (it != seqGroup.end()) {  
263                                 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
264                                 outFile << name << '\t' << nameList << endl;
265                                 outFile.close();
266                         }else{
267                                 wroteExtra = true;
268                                 remainingNames << name << '\t' << nameList << endl;
269                         }
270                 }
271                 bigNameFile.close();
272                 remainingNames.close();
273                 
274                 if (!wroteExtra) { 
275                         remove(singleton.c_str());
276                         singleton = "none";
277                 }
278                 
279                 for(int i=0;i<numGroups;i++){
280                         //if there are valid distances
281                         if (validDistances[i]) {
282                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
283                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
284                                 
285                                 map<string, string> temp;
286                                 temp[tempDistFile] = tempNameFile;
287                                 dists.push_back(temp);
288                         }
289                 }
290                 
291                 if (m->control_pressed)  {  
292                         for (int i = 0; i < dists.size(); i++) { 
293                                 remove((dists[i].begin()->first).c_str());
294                                 remove((dists[i].begin()->second).c_str());
295                         }
296                         dists.clear();
297                 }
298                 
299                 return 0;
300         }
301         catch(exception& e) {
302                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
303                 exit(1);
304         }
305 }
306 /***********************************************************************/
307 int SplitMatrix::splitDistanceLarge(){
308         try {
309                 vector<set<string> > groups;
310                 
311                 //for buffering the io to improve speed
312                  //allow for 30 dists to be stored, then output.
313                 vector<string> outputs;
314                 vector<int> numOutputs;
315                 vector<bool> wroteOutPut;
316                 
317                 int numGroups = 0;
318
319                 ofstream outFile;
320                 ifstream dFile;
321                 openInputFile(distFile, dFile);
322         
323                 while(dFile){
324                         string seqA, seqB;
325                         float dist;
326
327                         dFile >> seqA >> seqB >> dist;
328                         
329                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
330                                         
331                         if(dist < cutoff){
332                                 //cout << "in cutoff: " << dist << endl;
333                                 int groupIDA = -1;
334                                 int groupIDB = -1;
335                                 int groupID = -1;
336                                 
337                                 for(int i=0;i<numGroups;i++){
338                                         set<string>::iterator aIt = groups[i].find(seqA);
339                                         set<string>::iterator bIt = groups[i].find(seqB);
340                                         
341                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
342                                                 groups[i].insert(seqB);
343                                                 groupIDA = i;
344                                                 groupID = groupIDA;
345
346                                                 //cout << "in aIt: " << groupID << endl;
347         //                                      break;
348                                         }
349                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
350                                                 groups[i].insert(seqA);
351                                                 groupIDB = i;
352                                                 groupID = groupIDB;
353
354                                         //      cout << "in bIt: " << groupID << endl;
355         //                                      break;
356                                         }
357                                 
358                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
359                                                 if(groupIDA < groupIDB){
360                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
361                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
362                                                         groups[groupIDB].clear(); 
363                                                         groupID = groupIDA;
364                                                 }
365                                                 else{
366                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
367                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
368                                                         groups[groupIDA].clear();  
369                                                         groupID = groupIDB;
370                                                 }
371                                                 break;
372                                         }
373                                 }
374                                 
375         //windows is gonna gag on the reuse of outFile, will need to make it local...
376                                 
377                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
378                                         set<string> newGroup;
379                                         newGroup.insert(seqA);
380                                         newGroup.insert(seqB);
381                                         groups.push_back(newGroup);
382                                                                         
383                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
384                                         outputs.push_back(tempOut);
385                                         numOutputs.push_back(1);
386                                         wroteOutPut.push_back(false);
387                                         
388                                         numGroups++;
389                                 }
390                                 else{
391                                         string fileName = distFile + "." + toString(groupID) + ".temp";
392                                                                                         
393                                         //have we reached the max buffer size
394                                         if (numOutputs[groupID] > 60) { //write out sequence
395                                                 outFile.open(fileName.c_str(), ios::app);
396                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
397                                                 outFile.close();
398                                                 
399                                                 outputs[groupID] = "";
400                                                 numOutputs[groupID] = 0;
401                                                 wroteOutPut[groupID] = true;
402                                         }else {
403                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
404                                                 numOutputs[groupID]++;
405                                         }
406                                         
407                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
408                                                 string row, column, distance;
409                                                 if(groupIDA<groupIDB){
410                                                         
411                                                         //merge memory
412                                                         numOutputs[groupID] += numOutputs[groupIDB];
413                                                         outputs[groupID] += outputs[groupIDB];
414                                                         
415                                                         outputs[groupIDB] = "";
416                                                         numOutputs[groupIDB] = 0;
417                                                         
418                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
419                                                         if (wroteOutPut[groupIDB]) {
420                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
421                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
422                                                                 
423                                                                 outFile.open(fileName.c_str(), ios::app);
424                                                                 
425                                                                 long size;
426                                                                 char* memblock;
427
428                                                                 size = fileB.tellg();
429                                 
430                                                                 fileB.seekg (0, ios::beg);
431                                                                 
432                                                                 int numRead = size / 1024;
433                                                                 int lastRead = size % 1024;
434
435                                                                 for (int i = 0; i < numRead; i++) {
436                                 
437                                                                         memblock = new char [1024];
438                                                                 
439                                                                         fileB.read (memblock, 1024);
440                                                                         
441                                                                         string temp = memblock;
442                                                                         outFile << temp.substr(0, 1024);
443                                                                         
444                                                                         delete memblock;
445                                                                 }
446                                                                 
447                                                                 memblock = new char [lastRead];
448                                                                 
449                                                                 fileB.read (memblock, lastRead);
450                                                                 
451                                                                 //not sure why but it will read more than lastRead char...??
452                                                                 string temp = memblock;
453                                                                 outFile << temp.substr(0, lastRead);
454                                                                 delete memblock;
455                                                                 
456                                                                 fileB.close();
457                                                                 remove(fileName2.c_str());
458                                                                 
459                                                                 //write out the merged memory
460                                                                 if (numOutputs[groupID] > 60) {
461                                                                         outFile << outputs[groupID];
462                                                                         outputs[groupID] = "";
463                                                                         numOutputs[groupID] = 0;
464                                                                 }
465                                                                 
466                                                                 outFile.close();
467                                                                 
468                                                                 wroteOutPut[groupID] = true;
469                                                                 wroteOutPut[groupIDB] = false;
470                                                         }else{ } //just merge b's memory with a's memory 
471                                                 }
472                                                 else{
473                                                         numOutputs[groupID] += numOutputs[groupIDA];
474                                                         outputs[groupID] += outputs[groupIDA];
475                                                         
476                                                         outputs[groupIDA] = "";
477                                                         numOutputs[groupIDA] = 0;
478                                                         
479                                                         if (wroteOutPut[groupIDA]) {
480                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
481                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
482                                                                 
483                                                                 outFile.open(fileName.c_str(), ios::app);
484                                                                 
485                                                                 long size;
486                                                                 char* memblock;
487
488                                                                 size = fileB.tellg();
489                                                                                                                         
490                                                                 fileB.seekg (0, ios::beg);
491                                                                 
492                                                                 int numRead = size / 1024;
493                                                                 int lastRead = size % 1024;
494
495                                                                 for (int i = 0; i < numRead; i++) {
496                                 
497                                                                         memblock = new char [1024];
498                                                                 
499                                                                         fileB.read (memblock, 1024);
500                                                                         string temp = memblock;
501                                                                         outFile << temp.substr(0, 1024);
502                                                                         
503                                                                         delete memblock;
504                                                                 }
505                                                                 
506                                                                 memblock = new char [lastRead];
507                                                                 
508                                                                 fileB.read (memblock, lastRead);
509                                                                 
510                                                                 //not sure why but it will read more than lastRead char...??
511                                                                 string temp = memblock;
512                                                                 outFile << temp.substr(0, lastRead);
513                                                                         
514                                                                 delete memblock;
515                                                                 
516                                                                 fileB.close();
517                                                                 remove(fileName2.c_str());
518                                                                 
519                                                                 //write out the merged memory
520                                                                 if (numOutputs[groupID] > 60) {
521                                                                         outFile << outputs[groupID];
522                                                                         outputs[groupID] = "";
523                                                                         numOutputs[groupID] = 0;
524                                                                 }
525                                                                 
526                                                                 outFile.close();
527                                                                 
528                                                                 wroteOutPut[groupID] = true;
529                                                                 wroteOutPut[groupIDA] = false;
530                                                         }else { } //just merge memory
531                                                 }                                       
532                                         }
533                                 }
534                         }
535                         gobble(dFile);
536                 }
537                 dFile.close();
538                 
539                 for (int i = 0; i < numGroups; i++) {
540                         if (numOutputs[i] > 0) {
541                                 string fileName = distFile + "." + toString(i) + ".temp";
542                                 outFile.open(fileName.c_str(), ios::app);
543                                 outFile << outputs[i];
544                                 outFile.close();
545                         }
546                 }
547
548                 splitNames(groups);
549                                 
550                 return 0;                       
551         }
552         catch(exception& e) {
553                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
554                 exit(1);
555         }
556 }
557 //********************************************************************************************************************
558 int SplitMatrix::splitNames(vector<set<string> >& groups){
559         try {
560                 int numGroups = groups.size();
561         
562                 ifstream bigNameFile(namefile.c_str());
563                 if(!bigNameFile){
564                         cerr << "Error: We can't open the name file\n";
565                         exit(1);
566                 }
567                 
568                 map<string, string> nameMap;
569                 string name, nameList;
570                 while(bigNameFile){
571                         bigNameFile >> name >> nameList;
572                         nameMap[name] = nameList;
573                         gobble(bigNameFile);
574                 }
575                 bigNameFile.close();
576                         
577                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
578                         int numSeqsInGroup = groups[i].size();
579                         
580                         if(numSeqsInGroup > 0){
581                                 string fileName = namefile + "." + toString(i) + ".temp";
582                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
583                                 
584                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
585                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
586                                         if (nIt != nameMap.end()) {
587                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
588                                                 nameMap.erase(nIt);
589                                         }else{
590                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
591                                         }
592                                 }
593                                 smallNameFile.close();
594                         }
595                 }
596                 
597                 //names of singletons
598                 if (nameMap.size() != 0) {
599                         singleton = namefile + ".extra.temp";
600                         ofstream remainingNames(singleton.c_str(), ios::ate);
601                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
602                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
603                         }
604                         remainingNames.close();
605                 }else { singleton = "none"; }
606                         
607                 for(int i=0;i<numGroups;i++){
608                         if(groups[i].size() > 0){
609                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
610                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
611                                 
612                                 map<string, string> temp;
613                                 temp[tempDistFile] = tempNameFile;
614                                 dists.push_back(temp);
615                         }
616                 }
617                 
618                 if (m->control_pressed)  {  
619                         for (int i = 0; i < dists.size(); i++) { 
620                                 remove((dists[i].begin()->first).c_str());
621                                 remove((dists[i].begin()->second).c_str());
622                         }
623                         dists.clear();
624                 }
625                 
626                 return 0;
627         }
628         catch(exception& e) {
629                 m->errorOut(e, "SplitMatrix", "splitNames");
630                 exit(1);
631         }
632 }
633 //********************************************************************************************************************
634 int SplitMatrix::splitDistanceRAM(){
635         try {
636                 vector<set<string> > groups;
637                 vector<string> outputs;
638                 
639                 int numGroups = 0;
640
641                 ifstream dFile;
642                 openInputFile(distFile, dFile);
643
644                 while(dFile){
645                         string seqA, seqB;
646                         float dist;
647
648                         dFile >> seqA >> seqB >> dist;
649                         
650                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
651                                         
652                         if(dist < cutoff){
653                                 //cout << "in cutoff: " << dist << endl;
654                                 int groupIDA = -1;
655                                 int groupIDB = -1;
656                                 int groupID = -1;
657                                 
658                                 for(int i=0;i<numGroups;i++){
659                                         set<string>::iterator aIt = groups[i].find(seqA);
660                                         set<string>::iterator bIt = groups[i].find(seqB);
661                                         
662                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
663                                                 groups[i].insert(seqB);
664                                                 groupIDA = i;
665                                                 groupID = groupIDA;
666
667                                                 //cout << "in aIt: " << groupID << endl;
668         //                                      break;
669                                         }
670                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
671                                                 groups[i].insert(seqA);
672                                                 groupIDB = i;
673                                                 groupID = groupIDB;
674
675                                         //      cout << "in bIt: " << groupID << endl;
676         //                                      break;
677                                         }
678                                 
679                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
680                                                 if(groupIDA < groupIDB){
681                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
682                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
683                                                         groups[groupIDB].clear(); 
684                                                         groupID = groupIDA;
685                                                 }
686                                                 else{
687                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
688                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
689                                                         groups[groupIDA].clear();  
690                                                         groupID = groupIDB;
691                                                 }
692                                                 break;
693                                         }
694                                 }
695                                 
696         //windows is gonna gag on the reuse of outFile, will need to make it local...
697                                 
698                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
699                                         set<string> newGroup;
700                                         newGroup.insert(seqA);
701                                         newGroup.insert(seqB);
702                                         groups.push_back(newGroup);
703                                                                         
704                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
705                                         outputs.push_back(tempOut);
706                                         numGroups++;
707                                 }
708                                 else{
709                                                                                         
710                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
711                                         
712                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
713                                                 string row, column, distance;
714                                                 if(groupIDA<groupIDB){
715                                                         //merge memory
716                                                         outputs[groupID] += outputs[groupIDB];
717                                                         outputs[groupIDB] = "";
718                                                 }else{
719                                                         outputs[groupID] += outputs[groupIDA];
720                                                         outputs[groupIDA] = "";
721                                                 }                                       
722                                         }
723                                 }
724                         }
725                         gobble(dFile);
726                 }
727                 dFile.close();
728                 
729                 for (int i = 0; i < numGroups; i++) {
730                         if (outputs[i] != "") {
731                                 ofstream outFile;
732                                 string fileName = distFile + "." + toString(i) + ".temp";
733                                 outFile.open(fileName.c_str(), ios::ate);
734                                 outFile << outputs[i];
735                                 outFile.close();
736                         }
737                 }
738
739                 splitNames(groups);
740                                 
741                 return 0;                       
742         }
743         catch(exception& e) {
744                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
745                 exit(1);
746         }
747 }
748 //********************************************************************************************************************
749 //sorts biggest to smallest
750 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
751         
752         FILE * pFile;
753         long leftsize = 0;
754                 
755         //get num bytes in file
756         string filename = left.begin()->first;
757         pFile = fopen (filename.c_str(),"rb");
758         string error = "Error opening " + filename;
759         if (pFile==NULL) perror (error.c_str());
760         else{
761                 fseek (pFile, 0, SEEK_END);
762                 leftsize=ftell (pFile);
763                 fclose (pFile);
764         }
765
766         FILE * pFile2;
767         long rightsize = 0;
768                 
769         //get num bytes in file
770         filename = right.begin()->first;
771         pFile2 = fopen (filename.c_str(),"rb");
772         error = "Error opening " + filename;
773         if (pFile2==NULL) perror (error.c_str());
774         else{
775                 fseek (pFile2, 0, SEEK_END);
776                 rightsize=ftell (pFile2);
777                 fclose (pFile2);
778         }
779
780         return (leftsize > rightsize);  
781
782 /***********************************************************************/
783 //returns map of distance files -> namefile sorted by distance file size
784 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
785         try {   
786                 
787                 sort(dists.begin(), dists.end(), compareFileSizes);
788                 
789                 return dists;
790         }
791         catch(exception& e) {
792                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
793                 exit(1);
794         }
795 }
796 /***********************************************************************/
797 SplitMatrix::~SplitMatrix(){}
798 /***********************************************************************/
799