]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
added fasta method to cluster.split
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13
14 /***********************************************************************/
15
16 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
17         m = MothurOut::getInstance();
18         distFile = distfile;
19         cutoff = c;
20         namefile = name;
21         method = t;
22         taxFile = tax;
23         large = l;
24 }
25 /***********************************************************************/
26
27 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p, string output){
28         m = MothurOut::getInstance();
29         fastafile = ffile;
30         namefile = name;
31         taxFile = tax;
32         cutoff = c;
33         method = t;
34         processors = p;
35         outputDir = output;
36 }
37
38 /***********************************************************************/
39
40 int SplitMatrix::split(){
41         try {
42         
43                 if (method == "distance") {  
44                         splitDistance();
45                 }else if ((method == "classify") || (method == "fasta")) {
46                         splitClassify();
47                 }else {
48                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
49                         map<string, string> temp;
50                         temp[distFile] = namefile;
51                         dists.push_back(temp);
52                 }
53                 
54                 return 0;
55         }
56         catch(exception& e) {
57                 m->errorOut(e, "SplitMatrix", "split");
58                 exit(1);
59         }
60 }
61 /***********************************************************************/
62 int SplitMatrix::splitDistance(){
63         try {
64         
65                 if (large)      { splitDistanceLarge(); }
66                 else            { splitDistanceRAM();   }
67                         
68         }
69         catch(exception& e) {
70                 m->errorOut(e, "SplitMatrix", "splitDistance");
71                 exit(1);
72         }
73 }
74
75 /***********************************************************************/
76 int SplitMatrix::splitClassify(){
77         try {
78                 cutoff = int(cutoff);
79                 
80                 map<string, int> seqGroup;
81                 map<string, int>::iterator it;
82                 map<string, int>::iterator it2;
83                 
84                 int numGroups = 0;
85                 
86                 //build tree from users taxonomy file
87                 PhyloTree* phylo = new PhyloTree();
88                 
89                 ifstream in;
90                 openInputFile(taxFile, in);
91                         
92                 //read in users taxonomy file and add sequences to tree
93                 string seqname, tax;
94                 while(!in.eof()){
95                         in >> seqname >> tax; gobble(in);
96                         phylo->addSeqToTree(seqname, tax);
97                 }
98                 in.close();
99                 
100                 phylo->assignHeirarchyIDs(0);
101
102                 //make sure the cutoff is not greater than maxlevel
103                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
104         
105                 //for each node in tree
106                 for (int i = 0; i < phylo->getNumNodes(); i++) {
107                 
108                         //is this node within the cutoff
109                         TaxNode taxon = phylo->get(i);
110         
111                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
112                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
113                                         for (int j = 0; j < taxon.accessions.size(); j++) {
114                                                 seqGroup[taxon.accessions[j]] = numGroups;
115                                         }
116                                         numGroups++;
117                                 }
118                         }
119                 }
120         
121                 delete phylo;
122                 
123                 if (method == "classify") {
124                         splitDistanceFileByTax(seqGroup, numGroups);
125                 }else {
126                         createDistanceFilesFromTax(seqGroup, numGroups);
127                 }
128                 
129                 return 0;
130                         
131         }
132         catch(exception& e) {
133                 m->errorOut(e, "SplitMatrix", "splitClassify");
134                 exit(1);
135         }
136 }
137 /***********************************************************************/
138 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
139         try {
140                 map<string, int> copyGroups = seqGroup;
141                 map<string, int>::iterator it;
142                 set<string> names;
143                                 
144                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
145                         remove((fastafile + "." + toString(i) + ".temp").c_str());
146                 }
147                         
148                 ifstream in;
149                 openInputFile(fastafile, in);
150         
151                 //parse fastafile
152                 ofstream outFile;
153                 while (!in.eof()) {
154                         Sequence query(in); gobble(in);
155                         if (query.getName() != "") {
156                 
157                                 it = seqGroup.find(query.getName());
158                                 
159                                 //save names in case no namefile is given
160                                 if (namefile == "") {  names.insert(query.getName()); }
161                         
162                                 if (it != seqGroup.end()) { //not singleton 
163                                         openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
164                                         query.printSequence(outFile); 
165                                         outFile.close();
166                                         
167                                         copyGroups.erase(query.getName());
168                                 }
169                         }
170                 }
171                 in.close();
172                 
173                 //warn about sequence in groups that are not in fasta file
174                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
175                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
176                         exit(1);
177                 }
178                 
179                 copyGroups.clear();
180                 
181                 //process each distance file
182                 for (int i = 0; i < numGroups; i++) { 
183                         
184                         string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(cutoff);
185                         
186                         Command* command = new DistanceCommand(options);
187                         command->execute();
188                         delete command;
189                         
190                         remove((fastafile + "." + toString(i) + ".temp").c_str());
191                         
192                         //remove old names files just in case
193                         remove((namefile + "." + toString(i) + ".temp").c_str());
194                 }
195                 
196                 singleton = namefile + ".extra.temp";
197                 ofstream remainingNames;
198                 openOutputFile(singleton, remainingNames);
199                 
200                 bool wroteExtra = false;
201
202                 ifstream bigNameFile;
203                 openInputFile(namefile, bigNameFile);
204                 
205                 string name, nameList;
206                 while(!bigNameFile.eof()){
207                         bigNameFile >> name >> nameList;  gobble(bigNameFile);
208                         
209                         //did this sequence get assigned a group
210                         it = seqGroup.find(name);
211                         
212                         if (it != seqGroup.end()) {  
213                                 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
214                                 outFile << name << '\t' << nameList << endl;
215                                 outFile.close();
216                         }else{
217                                 wroteExtra = true;
218                                 remainingNames << name << '\t' << nameList << endl;
219                         }
220                 }
221                 bigNameFile.close();
222                 
223                 for(int i=0;i<numGroups;i++){
224                         string tempNameFile = namefile + "." + toString(i) + ".temp";
225                         string tempDistFile = outputDir + getRootName(getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
226
227                         //if there are valid distances
228                         ifstream fileHandle;
229                         fileHandle.open(tempDistFile.c_str());
230                         if(fileHandle)  {       
231                                 gobble(fileHandle);
232                                 if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
233                                         map<string, string> temp;
234                                         temp[tempDistFile] = tempNameFile;
235                                         dists.push_back(temp);
236                                 }else {
237                                         ifstream in;
238                                         openInputFile(tempNameFile, in);
239                                 
240                                         while(!in.eof()) { 
241                                                 in >> name >> nameList;  gobble(in);
242                                                 wroteExtra = true;
243                                                 remainingNames << name << '\t' << nameList << endl;
244                                         }
245                                         in.close();
246                                         remove(tempNameFile.c_str());
247                                 }
248                         }
249                         fileHandle.close();
250                 }
251                 
252                 remainingNames.close();
253                 if (!wroteExtra) { 
254                         remove(singleton.c_str());
255                         singleton = "none";
256                 }
257
258                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
259                 
260                 return 0;
261         }
262         catch(exception& e) {
263                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
264                 exit(1);
265         }
266 }
267 /***********************************************************************/
268 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
269         try {
270                 map<string, int>::iterator it;
271                 map<string, int>::iterator it2;
272                 
273                 ifstream dFile;
274                 openInputFile(distFile, dFile);
275                 ofstream outFile;
276                 
277                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
278                         remove((distFile + "." + toString(i) + ".temp").c_str());
279                 }
280                 
281                 //for buffering the io to improve speed
282                  //allow for 10 dists to be stored, then output.
283                 vector<string> outputs;  outputs.resize(numGroups, "");
284                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
285                 
286                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
287                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
288                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
289                 
290                 //for each distance
291                 while(dFile){
292                         string seqA, seqB;
293                         float dist;
294                         
295                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
296                         
297                         dFile >> seqA >> seqB >> dist;  gobble(dFile);
298                         
299                         //if both sequences are in the same group then they are within the cutoff
300                         it = seqGroup.find(seqA);
301                         it2 = seqGroup.find(seqB);
302                         
303                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
304                                 if (it->second == it2->second) { //they are from the same group so add the distance
305                                         if (numOutputs[it->second] > 30) {
306                                                 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
307                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
308                                                 outFile.close();
309                                                 outputs[it->second] = "";
310                                                 numOutputs[it->second] = 0;
311                                                 validDistances[it->second] = true;
312                                         }else{
313                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
314                                                 numOutputs[it->second]++;
315                                         }
316                                 }
317                         }
318                 }
319                 dFile.close();
320         
321                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
322                         remove((namefile + "." + toString(i) + ".temp").c_str());
323                         
324                         //write out any remaining buffers
325                         if (numOutputs[i] > 0) {
326                                 openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
327                                 outFile << outputs[i];
328                                 outFile.close();
329                                 outputs[i] = "";
330                                 numOutputs[i] = 0;
331                                 validDistances[i] = true;
332                         }
333                 }
334                 
335                 ifstream bigNameFile;
336                 openInputFile(namefile, bigNameFile);
337                 
338                 singleton = namefile + ".extra.temp";
339                 ofstream remainingNames;
340                 openOutputFile(singleton, remainingNames);
341                 
342                 bool wroteExtra = false;
343                                                 
344                 string name, nameList;
345                 while(!bigNameFile.eof()){
346                         bigNameFile >> name >> nameList;  gobble(bigNameFile);
347                         
348                         //did this sequence get assigned a group
349                         it = seqGroup.find(name);
350                         
351                         if (it != seqGroup.end()) {  
352                                 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
353                                 outFile << name << '\t' << nameList << endl;
354                                 outFile.close();
355                         }else{
356                                 wroteExtra = true;
357                                 remainingNames << name << '\t' << nameList << endl;
358                         }
359                 }
360                 bigNameFile.close();
361                                 
362                 for(int i=0;i<numGroups;i++){
363                         string tempNameFile = namefile + "." + toString(i) + ".temp";
364                         string tempDistFile = distFile + "." + toString(i) + ".temp";
365
366                         //if there are valid distances
367                         if (validDistances[i]) {
368                                 map<string, string> temp;
369                                 temp[tempDistFile] = tempNameFile;
370                                 dists.push_back(temp);
371                         }else{
372                                 ifstream in;
373                                 openInputFile(tempNameFile, in);
374                                 
375                                 while(!in.eof()) { 
376                                         in >> name >> nameList;  gobble(in);
377                                         wroteExtra = true;
378                                         remainingNames << name << '\t' << nameList << endl;
379                                 }
380                                 in.close();
381                                 remove(tempNameFile.c_str());
382                         }
383                 }
384                 
385                 remainingNames.close();
386                 
387                 if (!wroteExtra) { 
388                         remove(singleton.c_str());
389                         singleton = "none";
390                 }
391
392                 if (m->control_pressed)  {  
393                         for (int i = 0; i < dists.size(); i++) { 
394                                 remove((dists[i].begin()->first).c_str());
395                                 remove((dists[i].begin()->second).c_str());
396                         }
397                         dists.clear();
398                 }
399                 
400                 return 0;
401         }
402         catch(exception& e) {
403                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
404                 exit(1);
405         }
406 }
407 /***********************************************************************/
408 int SplitMatrix::splitDistanceLarge(){
409         try {
410                 vector<set<string> > groups;
411                 
412                 //for buffering the io to improve speed
413                  //allow for 30 dists to be stored, then output.
414                 vector<string> outputs;
415                 vector<int> numOutputs;
416                 vector<bool> wroteOutPut;
417                 
418                 int numGroups = 0;
419
420                 ofstream outFile;
421                 ifstream dFile;
422                 openInputFile(distFile, dFile);
423         
424                 while(dFile){
425                         string seqA, seqB;
426                         float dist;
427
428                         dFile >> seqA >> seqB >> dist;
429                         
430                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
431                                         
432                         if(dist < cutoff){
433                                 //cout << "in cutoff: " << dist << endl;
434                                 int groupIDA = -1;
435                                 int groupIDB = -1;
436                                 int groupID = -1;
437                                 
438                                 for(int i=0;i<numGroups;i++){
439                                         set<string>::iterator aIt = groups[i].find(seqA);
440                                         set<string>::iterator bIt = groups[i].find(seqB);
441                                         
442                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
443                                                 groups[i].insert(seqB);
444                                                 groupIDA = i;
445                                                 groupID = groupIDA;
446
447                                                 //cout << "in aIt: " << groupID << endl;
448         //                                      break;
449                                         }
450                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
451                                                 groups[i].insert(seqA);
452                                                 groupIDB = i;
453                                                 groupID = groupIDB;
454
455                                         //      cout << "in bIt: " << groupID << endl;
456         //                                      break;
457                                         }
458                                 
459                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
460                                                 if(groupIDA < groupIDB){
461                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
462                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
463                                                         groups[groupIDB].clear(); 
464                                                         groupID = groupIDA;
465                                                 }
466                                                 else{
467                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
468                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
469                                                         groups[groupIDA].clear();  
470                                                         groupID = groupIDB;
471                                                 }
472                                                 break;
473                                         }
474                                 }
475                                 
476         //windows is gonna gag on the reuse of outFile, will need to make it local...
477                                 
478                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
479                                         set<string> newGroup;
480                                         newGroup.insert(seqA);
481                                         newGroup.insert(seqB);
482                                         groups.push_back(newGroup);
483                                                                         
484                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
485                                         outputs.push_back(tempOut);
486                                         numOutputs.push_back(1);
487                                         wroteOutPut.push_back(false);
488                                         
489                                         numGroups++;
490                                 }
491                                 else{
492                                         string fileName = distFile + "." + toString(groupID) + ".temp";
493                                                                                         
494                                         //have we reached the max buffer size
495                                         if (numOutputs[groupID] > 60) { //write out sequence
496                                                 outFile.open(fileName.c_str(), ios::app);
497                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
498                                                 outFile.close();
499                                                 
500                                                 outputs[groupID] = "";
501                                                 numOutputs[groupID] = 0;
502                                                 wroteOutPut[groupID] = true;
503                                         }else {
504                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
505                                                 numOutputs[groupID]++;
506                                         }
507                                         
508                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
509                                                 string row, column, distance;
510                                                 if(groupIDA<groupIDB){
511                                                         
512                                                         //merge memory
513                                                         numOutputs[groupID] += numOutputs[groupIDB];
514                                                         outputs[groupID] += outputs[groupIDB];
515                                                         
516                                                         outputs[groupIDB] = "";
517                                                         numOutputs[groupIDB] = 0;
518                                                         
519                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
520                                                         if (wroteOutPut[groupIDB]) {
521                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
522                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
523                                                                 
524                                                                 outFile.open(fileName.c_str(), ios::app);
525                                                                 
526                                                                 long size;
527                                                                 char* memblock;
528
529                                                                 size = fileB.tellg();
530                                 
531                                                                 fileB.seekg (0, ios::beg);
532                                                                 
533                                                                 int numRead = size / 1024;
534                                                                 int lastRead = size % 1024;
535
536                                                                 for (int i = 0; i < numRead; i++) {
537                                 
538                                                                         memblock = new char [1024];
539                                                                 
540                                                                         fileB.read (memblock, 1024);
541                                                                         
542                                                                         string temp = memblock;
543                                                                         outFile << temp.substr(0, 1024);
544                                                                         
545                                                                         delete memblock;
546                                                                 }
547                                                                 
548                                                                 memblock = new char [lastRead];
549                                                                 
550                                                                 fileB.read (memblock, lastRead);
551                                                                 
552                                                                 //not sure why but it will read more than lastRead char...??
553                                                                 string temp = memblock;
554                                                                 outFile << temp.substr(0, lastRead);
555                                                                 delete memblock;
556                                                                 
557                                                                 fileB.close();
558                                                                 remove(fileName2.c_str());
559                                                                 
560                                                                 //write out the merged memory
561                                                                 if (numOutputs[groupID] > 60) {
562                                                                         outFile << outputs[groupID];
563                                                                         outputs[groupID] = "";
564                                                                         numOutputs[groupID] = 0;
565                                                                 }
566                                                                 
567                                                                 outFile.close();
568                                                                 
569                                                                 wroteOutPut[groupID] = true;
570                                                                 wroteOutPut[groupIDB] = false;
571                                                         }else{ } //just merge b's memory with a's memory 
572                                                 }
573                                                 else{
574                                                         numOutputs[groupID] += numOutputs[groupIDA];
575                                                         outputs[groupID] += outputs[groupIDA];
576                                                         
577                                                         outputs[groupIDA] = "";
578                                                         numOutputs[groupIDA] = 0;
579                                                         
580                                                         if (wroteOutPut[groupIDA]) {
581                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
582                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
583                                                                 
584                                                                 outFile.open(fileName.c_str(), ios::app);
585                                                                 
586                                                                 long size;
587                                                                 char* memblock;
588
589                                                                 size = fileB.tellg();
590                                                                                                                         
591                                                                 fileB.seekg (0, ios::beg);
592                                                                 
593                                                                 int numRead = size / 1024;
594                                                                 int lastRead = size % 1024;
595
596                                                                 for (int i = 0; i < numRead; i++) {
597                                 
598                                                                         memblock = new char [1024];
599                                                                 
600                                                                         fileB.read (memblock, 1024);
601                                                                         string temp = memblock;
602                                                                         outFile << temp.substr(0, 1024);
603                                                                         
604                                                                         delete memblock;
605                                                                 }
606                                                                 
607                                                                 memblock = new char [lastRead];
608                                                                 
609                                                                 fileB.read (memblock, lastRead);
610                                                                 
611                                                                 //not sure why but it will read more than lastRead char...??
612                                                                 string temp = memblock;
613                                                                 outFile << temp.substr(0, lastRead);
614                                                                         
615                                                                 delete memblock;
616                                                                 
617                                                                 fileB.close();
618                                                                 remove(fileName2.c_str());
619                                                                 
620                                                                 //write out the merged memory
621                                                                 if (numOutputs[groupID] > 60) {
622                                                                         outFile << outputs[groupID];
623                                                                         outputs[groupID] = "";
624                                                                         numOutputs[groupID] = 0;
625                                                                 }
626                                                                 
627                                                                 outFile.close();
628                                                                 
629                                                                 wroteOutPut[groupID] = true;
630                                                                 wroteOutPut[groupIDA] = false;
631                                                         }else { } //just merge memory
632                                                 }                                       
633                                         }
634                                 }
635                         }
636                         gobble(dFile);
637                 }
638                 dFile.close();
639                 
640                 for (int i = 0; i < numGroups; i++) {
641                         if (numOutputs[i] > 0) {
642                                 string fileName = distFile + "." + toString(i) + ".temp";
643                                 outFile.open(fileName.c_str(), ios::app);
644                                 outFile << outputs[i];
645                                 outFile.close();
646                         }
647                 }
648
649                 splitNames(groups);
650                                 
651                 return 0;                       
652         }
653         catch(exception& e) {
654                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
655                 exit(1);
656         }
657 }
658 //********************************************************************************************************************
659 int SplitMatrix::splitNames(vector<set<string> >& groups){
660         try {
661                 int numGroups = groups.size();
662         
663                 ifstream bigNameFile(namefile.c_str());
664                 if(!bigNameFile){
665                         cerr << "Error: We can't open the name file\n";
666                         exit(1);
667                 }
668                 
669                 map<string, string> nameMap;
670                 string name, nameList;
671                 while(bigNameFile){
672                         bigNameFile >> name >> nameList;
673                         nameMap[name] = nameList;
674                         gobble(bigNameFile);
675                 }
676                 bigNameFile.close();
677                         
678                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
679                         int numSeqsInGroup = groups[i].size();
680                         
681                         if(numSeqsInGroup > 0){
682                                 string fileName = namefile + "." + toString(i) + ".temp";
683                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
684                                 
685                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
686                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
687                                         if (nIt != nameMap.end()) {
688                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
689                                                 nameMap.erase(nIt);
690                                         }else{
691                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
692                                         }
693                                 }
694                                 smallNameFile.close();
695                         }
696                 }
697                 
698                 //names of singletons
699                 if (nameMap.size() != 0) {
700                         singleton = namefile + ".extra.temp";
701                         ofstream remainingNames(singleton.c_str(), ios::ate);
702                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
703                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
704                         }
705                         remainingNames.close();
706                 }else { singleton = "none"; }
707                         
708                 for(int i=0;i<numGroups;i++){
709                         if(groups[i].size() > 0){
710                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
711                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
712                                 
713                                 map<string, string> temp;
714                                 temp[tempDistFile] = tempNameFile;
715                                 dists.push_back(temp);
716                         }
717                 }
718                 
719                 if (m->control_pressed)  {  
720                         for (int i = 0; i < dists.size(); i++) { 
721                                 remove((dists[i].begin()->first).c_str());
722                                 remove((dists[i].begin()->second).c_str());
723                         }
724                         dists.clear();
725                 }
726                 
727                 return 0;
728         }
729         catch(exception& e) {
730                 m->errorOut(e, "SplitMatrix", "splitNames");
731                 exit(1);
732         }
733 }
734 //********************************************************************************************************************
735 int SplitMatrix::splitDistanceRAM(){
736         try {
737                 vector<set<string> > groups;
738                 vector<string> outputs;
739                 
740                 int numGroups = 0;
741
742                 ifstream dFile;
743                 openInputFile(distFile, dFile);
744
745                 while(dFile){
746                         string seqA, seqB;
747                         float dist;
748
749                         dFile >> seqA >> seqB >> dist;
750                         
751                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
752                                         
753                         if(dist < cutoff){
754                                 //cout << "in cutoff: " << dist << endl;
755                                 int groupIDA = -1;
756                                 int groupIDB = -1;
757                                 int groupID = -1;
758                                 
759                                 for(int i=0;i<numGroups;i++){
760                                         set<string>::iterator aIt = groups[i].find(seqA);
761                                         set<string>::iterator bIt = groups[i].find(seqB);
762                                         
763                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
764                                                 groups[i].insert(seqB);
765                                                 groupIDA = i;
766                                                 groupID = groupIDA;
767
768                                                 //cout << "in aIt: " << groupID << endl;
769         //                                      break;
770                                         }
771                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
772                                                 groups[i].insert(seqA);
773                                                 groupIDB = i;
774                                                 groupID = groupIDB;
775
776                                         //      cout << "in bIt: " << groupID << endl;
777         //                                      break;
778                                         }
779                                 
780                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
781                                                 if(groupIDA < groupIDB){
782                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
783                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
784                                                         groups[groupIDB].clear(); 
785                                                         groupID = groupIDA;
786                                                 }
787                                                 else{
788                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
789                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
790                                                         groups[groupIDA].clear();  
791                                                         groupID = groupIDB;
792                                                 }
793                                                 break;
794                                         }
795                                 }
796                                 
797         //windows is gonna gag on the reuse of outFile, will need to make it local...
798                                 
799                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
800                                         set<string> newGroup;
801                                         newGroup.insert(seqA);
802                                         newGroup.insert(seqB);
803                                         groups.push_back(newGroup);
804                                                                         
805                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
806                                         outputs.push_back(tempOut);
807                                         numGroups++;
808                                 }
809                                 else{
810                                                                                         
811                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
812                                         
813                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
814                                                 string row, column, distance;
815                                                 if(groupIDA<groupIDB){
816                                                         //merge memory
817                                                         outputs[groupID] += outputs[groupIDB];
818                                                         outputs[groupIDB] = "";
819                                                 }else{
820                                                         outputs[groupID] += outputs[groupIDA];
821                                                         outputs[groupIDA] = "";
822                                                 }                                       
823                                         }
824                                 }
825                         }
826                         gobble(dFile);
827                 }
828                 dFile.close();
829                 
830                 for (int i = 0; i < numGroups; i++) {
831                         if (outputs[i] != "") {
832                                 ofstream outFile;
833                                 string fileName = distFile + "." + toString(i) + ".temp";
834                                 outFile.open(fileName.c_str(), ios::ate);
835                                 outFile << outputs[i];
836                                 outFile.close();
837                         }
838                 }
839
840                 splitNames(groups);
841                                 
842                 return 0;                       
843         }
844         catch(exception& e) {
845                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
846                 exit(1);
847         }
848 }
849 //********************************************************************************************************************
850 //sorts biggest to smallest
851 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
852         
853         FILE * pFile;
854         long leftsize = 0;
855                 
856         //get num bytes in file
857         string filename = left.begin()->first;
858         pFile = fopen (filename.c_str(),"rb");
859         string error = "Error opening " + filename;
860         if (pFile==NULL) perror (error.c_str());
861         else{
862                 fseek (pFile, 0, SEEK_END);
863                 leftsize=ftell (pFile);
864                 fclose (pFile);
865         }
866
867         FILE * pFile2;
868         long rightsize = 0;
869                 
870         //get num bytes in file
871         filename = right.begin()->first;
872         pFile2 = fopen (filename.c_str(),"rb");
873         error = "Error opening " + filename;
874         if (pFile2==NULL) perror (error.c_str());
875         else{
876                 fseek (pFile2, 0, SEEK_END);
877                 rightsize=ftell (pFile2);
878                 fclose (pFile2);
879         }
880
881         return (leftsize > rightsize);  
882
883 /***********************************************************************/
884 //returns map of distance files -> namefile sorted by distance file size
885 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
886         try {   
887                 
888                 sort(dists.begin(), dists.end(), compareFileSizes);
889                 
890                 return dists;
891         }
892         catch(exception& e) {
893                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
894                 exit(1);
895         }
896 }
897 /***********************************************************************/
898 SplitMatrix::~SplitMatrix(){}
899 /***********************************************************************/
900