]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
fixed cluster.split bug
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24         large = l;
25 }
26 /***********************************************************************/
27
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){
29         m = MothurOut::getInstance();
30         fastafile = ffile;
31         namefile = name;
32         taxFile = tax;
33         cutoff = c;  //tax level cutoff
34         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
35         method = t;
36         processors = p;
37         outputDir = output;
38 }
39
40 /***********************************************************************/
41
42 int SplitMatrix::split(){
43         try {
44         
45                 if (method == "distance") {  
46                         splitDistance();
47                 }else if ((method == "classify") || (method == "fasta")) {
48                         splitClassify();
49                 }else {
50                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
51                         map<string, string> temp;
52                         temp[distFile] = namefile;
53                         dists.push_back(temp);
54                 }
55                 
56                 return 0;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "SplitMatrix", "split");
60                 exit(1);
61         }
62 }
63 /***********************************************************************/
64 int SplitMatrix::splitDistance(){
65         try {
66         
67                 if (large)      { splitDistanceLarge(); }
68                 else            { splitDistanceRAM();   }
69                         
70         }
71         catch(exception& e) {
72                 m->errorOut(e, "SplitMatrix", "splitDistance");
73                 exit(1);
74         }
75 }
76
77 /***********************************************************************/
78 int SplitMatrix::splitClassify(){
79         try {
80                 cutoff = int(cutoff);
81                                 
82                 map<string, int> seqGroup;
83                 map<string, int>::iterator it;
84                 map<string, int>::iterator it2;
85                 
86                 int numGroups = 0;
87                 
88                 //build tree from users taxonomy file
89                 PhyloTree* phylo = new PhyloTree();
90                 
91                 ifstream in;
92                 m->openInputFile(taxFile, in);
93                         
94                 //read in users taxonomy file and add sequences to tree
95                 string seqname, tax;
96                 while(!in.eof()){
97                         in >> seqname >> tax; m->gobble(in);
98                         phylo->addSeqToTree(seqname, tax);
99                 }
100                 in.close();
101                 
102                 phylo->assignHeirarchyIDs(0);
103
104                 //make sure the cutoff is not greater than maxlevel
105                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
106         
107                 //for each node in tree
108                 for (int i = 0; i < phylo->getNumNodes(); i++) {
109                 
110                         //is this node within the cutoff
111                         TaxNode taxon = phylo->get(i);
112         
113                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
114                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
115                                         for (int j = 0; j < taxon.accessions.size(); j++) {
116                                                 seqGroup[taxon.accessions[j]] = numGroups;
117                                         }
118                                         numGroups++;
119                                 }
120                         }
121                 }
122         
123                 delete phylo;
124                 
125                 if (method == "classify") {
126                         splitDistanceFileByTax(seqGroup, numGroups);
127                 }else {
128                         createDistanceFilesFromTax(seqGroup, numGroups);
129                 }
130                 
131                 return 0;
132                         
133         }
134         catch(exception& e) {
135                 m->errorOut(e, "SplitMatrix", "splitClassify");
136                 exit(1);
137         }
138 }
139 /***********************************************************************/
140 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
141         try {
142                 map<string, int> copyGroups = seqGroup;
143                 map<string, int>::iterator it;
144                 set<string> names;
145                                 
146                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
147                         remove((fastafile + "." + toString(i) + ".temp").c_str());
148                 }
149                         
150                 ifstream in;
151                 m->openInputFile(fastafile, in);
152         
153                 //parse fastafile
154                 ofstream outFile;
155                 while (!in.eof()) {
156                         Sequence query(in); m->gobble(in);
157                         if (query.getName() != "") {
158                 
159                                 it = seqGroup.find(query.getName());
160                                 
161                                 //save names in case no namefile is given
162                                 if (namefile == "") {  names.insert(query.getName()); }
163                         
164                                 if (it != seqGroup.end()) { //not singleton 
165                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
166                                         query.printSequence(outFile); 
167                                         outFile.close();
168                                         
169                                         copyGroups.erase(query.getName());
170                                 }
171                         }
172                 }
173                 in.close();
174                 
175                 //warn about sequence in groups that are not in fasta file
176                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
177                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
178                         exit(1);
179                 }
180                 
181                 copyGroups.clear();
182                 
183                 //process each distance file
184                 for (int i = 0; i < numGroups; i++) { 
185                         
186                         string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff);
187                         
188                         Command* command = new DistanceCommand(options);
189                         
190                         command->execute();
191                         delete command;
192                         
193                         remove((fastafile + "." + toString(i) + ".temp").c_str());
194                         
195                         //remove old names files just in case
196                         remove((namefile + "." + toString(i) + ".temp").c_str());
197                 }
198                         
199                 singleton = namefile + ".extra.temp";
200                 ofstream remainingNames;
201                 m->openOutputFile(singleton, remainingNames);
202                 
203                 bool wroteExtra = false;
204
205                 ifstream bigNameFile;
206                 m->openInputFile(namefile, bigNameFile);
207                 
208                 string name, nameList;
209                 while(!bigNameFile.eof()){
210                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
211                         
212                         //did this sequence get assigned a group
213                         it = seqGroup.find(name);
214                         
215                         if (it != seqGroup.end()) {  
216                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
217                                 outFile << name << '\t' << nameList << endl;
218                                 outFile.close();
219                         }else{
220                                 wroteExtra = true;
221                                 remainingNames << name << '\t' << nameList << endl;
222                         }
223                 }
224                 bigNameFile.close();
225                 
226                 for(int i=0;i<numGroups;i++){
227                         string tempNameFile = namefile + "." + toString(i) + ".temp";
228                         if (outputDir == "") { outputDir = m->hasPath(fastafile); }
229                         string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
230
231                         //if there are valid distances
232                         ifstream fileHandle;
233                         fileHandle.open(tempDistFile.c_str());
234                         if(fileHandle)  {       
235                                 m->gobble(fileHandle);
236                                 if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
237                                         map<string, string> temp;
238                                         temp[tempDistFile] = tempNameFile;
239                                         dists.push_back(temp);
240                                 }else {
241                                         ifstream in;
242                                         m->openInputFile(tempNameFile, in);
243                                 
244                                         while(!in.eof()) { 
245                                                 in >> name >> nameList;  m->gobble(in);
246                                                 wroteExtra = true;
247                                                 remainingNames << name << '\t' << nameList << endl;
248                                         }
249                                         in.close();
250                                         remove(tempNameFile.c_str());
251                                 }
252                         }
253                         fileHandle.close();
254                 }
255                 
256                 remainingNames.close();
257                 if (!wroteExtra) { 
258                         remove(singleton.c_str());
259                         singleton = "none";
260                 }
261
262                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
263                 
264                 return 0;
265         }
266         catch(exception& e) {
267                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
268                 exit(1);
269         }
270 }
271 /***********************************************************************/
272 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
273         try {
274                 map<string, int>::iterator it;
275                 map<string, int>::iterator it2;
276                 
277                 ifstream dFile;
278                 m->openInputFile(distFile, dFile);
279                 ofstream outFile;
280                 
281                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
282                         remove((distFile + "." + toString(i) + ".temp").c_str());
283                 }
284                 
285                 //for buffering the io to improve speed
286                  //allow for 10 dists to be stored, then output.
287                 vector<string> outputs;  outputs.resize(numGroups, "");
288                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
289                 
290                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
291                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
292                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
293                 
294                 //for each distance
295                 while(dFile){
296                         string seqA, seqB;
297                         float dist;
298                         
299                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
300                         
301                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
302                         
303                         //if both sequences are in the same group then they are within the cutoff
304                         it = seqGroup.find(seqA);
305                         it2 = seqGroup.find(seqB);
306                         
307                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
308                                 if (it->second == it2->second) { //they are from the same group so add the distance
309                                         if (numOutputs[it->second] > 30) {
310                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
311                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
312                                                 outFile.close();
313                                                 outputs[it->second] = "";
314                                                 numOutputs[it->second] = 0;
315                                                 validDistances[it->second] = true;
316                                         }else{
317                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
318                                                 numOutputs[it->second]++;
319                                         }
320                                 }
321                         }
322                 }
323                 dFile.close();
324         
325                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
326                         remove((namefile + "." + toString(i) + ".temp").c_str());
327                         
328                         //write out any remaining buffers
329                         if (numOutputs[i] > 0) {
330                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
331                                 outFile << outputs[i];
332                                 outFile.close();
333                                 outputs[i] = "";
334                                 numOutputs[i] = 0;
335                                 validDistances[i] = true;
336                         }
337                 }
338                 
339                 ifstream bigNameFile;
340                 m->openInputFile(namefile, bigNameFile);
341                 
342                 singleton = namefile + ".extra.temp";
343                 ofstream remainingNames;
344                 m->openOutputFile(singleton, remainingNames);
345                 
346                 bool wroteExtra = false;
347                                                 
348                 string name, nameList;
349                 while(!bigNameFile.eof()){
350                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
351                         
352                         //did this sequence get assigned a group
353                         it = seqGroup.find(name);
354                         
355                         if (it != seqGroup.end()) {  
356                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
357                                 outFile << name << '\t' << nameList << endl;
358                                 outFile.close();
359                         }else{
360                                 wroteExtra = true;
361                                 remainingNames << name << '\t' << nameList << endl;
362                         }
363                 }
364                 bigNameFile.close();
365                                 
366                 for(int i=0;i<numGroups;i++){
367                         string tempNameFile = namefile + "." + toString(i) + ".temp";
368                         string tempDistFile = distFile + "." + toString(i) + ".temp";
369
370                         //if there are valid distances
371                         if (validDistances[i]) {
372                                 map<string, string> temp;
373                                 temp[tempDistFile] = tempNameFile;
374                                 dists.push_back(temp);
375                         }else{
376                                 ifstream in;
377                                 m->openInputFile(tempNameFile, in);
378                                 
379                                 while(!in.eof()) { 
380                                         in >> name >> nameList;  m->gobble(in);
381                                         wroteExtra = true;
382                                         remainingNames << name << '\t' << nameList << endl;
383                                 }
384                                 in.close();
385                                 remove(tempNameFile.c_str());
386                         }
387                 }
388                 
389                 remainingNames.close();
390                 
391                 if (!wroteExtra) { 
392                         remove(singleton.c_str());
393                         singleton = "none";
394                 }
395
396                 if (m->control_pressed)  {  
397                         for (int i = 0; i < dists.size(); i++) { 
398                                 remove((dists[i].begin()->first).c_str());
399                                 remove((dists[i].begin()->second).c_str());
400                         }
401                         dists.clear();
402                 }
403                 
404                 return 0;
405         }
406         catch(exception& e) {
407                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
408                 exit(1);
409         }
410 }
411 /***********************************************************************/
412 int SplitMatrix::splitDistanceLarge(){
413         try {
414                 vector<set<string> > groups;
415                 
416                 //for buffering the io to improve speed
417                  //allow for 30 dists to be stored, then output.
418                 vector<string> outputs;
419                 vector<int> numOutputs;
420                 vector<bool> wroteOutPut;
421                 
422                 int numGroups = 0;
423
424                 ofstream outFile;
425                 ifstream dFile;
426                 m->openInputFile(distFile, dFile);
427         
428                 while(dFile){
429                         string seqA, seqB;
430                         float dist;
431
432                         dFile >> seqA >> seqB >> dist;
433                         
434                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
435                                         
436                         if(dist < cutoff){
437                                 //cout << "in cutoff: " << dist << endl;
438                                 int groupIDA = -1;
439                                 int groupIDB = -1;
440                                 int groupID = -1;
441                                 
442                                 for(int i=0;i<numGroups;i++){
443                                         set<string>::iterator aIt = groups[i].find(seqA);
444                                         set<string>::iterator bIt = groups[i].find(seqB);
445                                         
446                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
447                                                 groups[i].insert(seqB);
448                                                 groupIDA = i;
449                                                 groupID = groupIDA;
450
451                                                 //cout << "in aIt: " << groupID << endl;
452         //                                      break;
453                                         }
454                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
455                                                 groups[i].insert(seqA);
456                                                 groupIDB = i;
457                                                 groupID = groupIDB;
458
459                                         //      cout << "in bIt: " << groupID << endl;
460         //                                      break;
461                                         }
462                                 
463                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
464                                                 if(groupIDA < groupIDB){
465                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
466                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
467                                                         groups[groupIDB].clear(); 
468                                                         groupID = groupIDA;
469                                                 }
470                                                 else{
471                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
472                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
473                                                         groups[groupIDA].clear();  
474                                                         groupID = groupIDB;
475                                                 }
476                                                 break;
477                                         }
478                                 }
479                                 
480         //windows is gonna gag on the reuse of outFile, will need to make it local...
481                                 
482                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
483                                         set<string> newGroup;
484                                         newGroup.insert(seqA);
485                                         newGroup.insert(seqB);
486                                         groups.push_back(newGroup);
487                                                                         
488                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
489                                         outputs.push_back(tempOut);
490                                         numOutputs.push_back(1);
491                                         wroteOutPut.push_back(false);
492                                         
493                                         numGroups++;
494                                 }
495                                 else{
496                                         string fileName = distFile + "." + toString(groupID) + ".temp";
497                                                                                         
498                                         //have we reached the max buffer size
499                                         if (numOutputs[groupID] > 60) { //write out sequence
500                                                 outFile.open(fileName.c_str(), ios::app);
501                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
502                                                 outFile.close();
503                                                 
504                                                 outputs[groupID] = "";
505                                                 numOutputs[groupID] = 0;
506                                                 wroteOutPut[groupID] = true;
507                                         }else {
508                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
509                                                 numOutputs[groupID]++;
510                                         }
511                                         
512                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
513                                                 string row, column, distance;
514                                                 if(groupIDA<groupIDB){
515                                                         
516                                                         //merge memory
517                                                         numOutputs[groupID] += numOutputs[groupIDB];
518                                                         outputs[groupID] += outputs[groupIDB];
519                                                         
520                                                         outputs[groupIDB] = "";
521                                                         numOutputs[groupIDB] = 0;
522                                                         
523                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
524                                                         if (wroteOutPut[groupIDB]) {
525                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
526                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
527                                                                 
528                                                                 outFile.open(fileName.c_str(), ios::app);
529                                                                 
530                                                                 long size;
531                                                                 char* memblock;
532
533                                                                 size = fileB.tellg();
534                                 
535                                                                 fileB.seekg (0, ios::beg);
536                                                                 
537                                                                 int numRead = size / 1024;
538                                                                 int lastRead = size % 1024;
539
540                                                                 for (int i = 0; i < numRead; i++) {
541                                 
542                                                                         memblock = new char [1024];
543                                                                 
544                                                                         fileB.read (memblock, 1024);
545                                                                         
546                                                                         string temp = memblock;
547                                                                         outFile << temp.substr(0, 1024);
548                                                                         
549                                                                         delete memblock;
550                                                                 }
551                                                                 
552                                                                 memblock = new char [lastRead];
553                                                                 
554                                                                 fileB.read (memblock, lastRead);
555                                                                 
556                                                                 //not sure why but it will read more than lastRead char...??
557                                                                 string temp = memblock;
558                                                                 outFile << temp.substr(0, lastRead);
559                                                                 delete memblock;
560                                                                 
561                                                                 fileB.close();
562                                                                 remove(fileName2.c_str());
563                                                                 
564                                                                 //write out the merged memory
565                                                                 if (numOutputs[groupID] > 60) {
566                                                                         outFile << outputs[groupID];
567                                                                         outputs[groupID] = "";
568                                                                         numOutputs[groupID] = 0;
569                                                                 }
570                                                                 
571                                                                 outFile.close();
572                                                                 
573                                                                 wroteOutPut[groupID] = true;
574                                                                 wroteOutPut[groupIDB] = false;
575                                                         }else{ } //just merge b's memory with a's memory 
576                                                 }
577                                                 else{
578                                                         numOutputs[groupID] += numOutputs[groupIDA];
579                                                         outputs[groupID] += outputs[groupIDA];
580                                                         
581                                                         outputs[groupIDA] = "";
582                                                         numOutputs[groupIDA] = 0;
583                                                         
584                                                         if (wroteOutPut[groupIDA]) {
585                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
586                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
587                                                                 
588                                                                 outFile.open(fileName.c_str(), ios::app);
589                                                                 
590                                                                 long size;
591                                                                 char* memblock;
592
593                                                                 size = fileB.tellg();
594                                                                                                                         
595                                                                 fileB.seekg (0, ios::beg);
596                                                                 
597                                                                 int numRead = size / 1024;
598                                                                 int lastRead = size % 1024;
599
600                                                                 for (int i = 0; i < numRead; i++) {
601                                 
602                                                                         memblock = new char [1024];
603                                                                 
604                                                                         fileB.read (memblock, 1024);
605                                                                         string temp = memblock;
606                                                                         outFile << temp.substr(0, 1024);
607                                                                         
608                                                                         delete memblock;
609                                                                 }
610                                                                 
611                                                                 memblock = new char [lastRead];
612                                                                 
613                                                                 fileB.read (memblock, lastRead);
614                                                                 
615                                                                 //not sure why but it will read more than lastRead char...??
616                                                                 string temp = memblock;
617                                                                 outFile << temp.substr(0, lastRead);
618                                                                         
619                                                                 delete memblock;
620                                                                 
621                                                                 fileB.close();
622                                                                 remove(fileName2.c_str());
623                                                                 
624                                                                 //write out the merged memory
625                                                                 if (numOutputs[groupID] > 60) {
626                                                                         outFile << outputs[groupID];
627                                                                         outputs[groupID] = "";
628                                                                         numOutputs[groupID] = 0;
629                                                                 }
630                                                                 
631                                                                 outFile.close();
632                                                                 
633                                                                 wroteOutPut[groupID] = true;
634                                                                 wroteOutPut[groupIDA] = false;
635                                                         }else { } //just merge memory
636                                                 }                                       
637                                         }
638                                 }
639                         }
640                         m->gobble(dFile);
641                 }
642                 dFile.close();
643                 
644                 for (int i = 0; i < numGroups; i++) {
645                         if (numOutputs[i] > 0) {
646                                 string fileName = distFile + "." + toString(i) + ".temp";
647                                 outFile.open(fileName.c_str(), ios::app);
648                                 outFile << outputs[i];
649                                 outFile.close();
650                         }
651                 }
652
653                 splitNames(groups);
654                                 
655                 return 0;                       
656         }
657         catch(exception& e) {
658                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
659                 exit(1);
660         }
661 }
662 //********************************************************************************************************************
663 int SplitMatrix::splitNames(vector<set<string> >& groups){
664         try {
665                 int numGroups = groups.size();
666         
667                 ifstream bigNameFile(namefile.c_str());
668                 if(!bigNameFile){
669                         cerr << "Error: We can't open the name file\n";
670                         exit(1);
671                 }
672                 
673                 map<string, string> nameMap;
674                 string name, nameList;
675                 while(bigNameFile){
676                         bigNameFile >> name >> nameList;
677                         nameMap[name] = nameList;
678                         m->gobble(bigNameFile);
679                 }
680                 bigNameFile.close();
681                         
682                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
683                         int numSeqsInGroup = groups[i].size();
684                         
685                         if(numSeqsInGroup > 0){
686                                 string fileName = namefile + "." + toString(i) + ".temp";
687                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
688                                 
689                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
690                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
691                                         if (nIt != nameMap.end()) {
692                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
693                                                 nameMap.erase(nIt);
694                                         }else{
695                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
696                                         }
697                                 }
698                                 smallNameFile.close();
699                         }
700                 }
701                 
702                 //names of singletons
703                 if (nameMap.size() != 0) {
704                         singleton = namefile + ".extra.temp";
705                         ofstream remainingNames(singleton.c_str(), ios::ate);
706                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
707                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
708                         }
709                         remainingNames.close();
710                 }else { singleton = "none"; }
711                         
712                 for(int i=0;i<numGroups;i++){
713                         if(groups[i].size() > 0){
714                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
715                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
716                                 
717                                 map<string, string> temp;
718                                 temp[tempDistFile] = tempNameFile;
719                                 dists.push_back(temp);
720                         }
721                 }
722                 
723                 if (m->control_pressed)  {  
724                         for (int i = 0; i < dists.size(); i++) { 
725                                 remove((dists[i].begin()->first).c_str());
726                                 remove((dists[i].begin()->second).c_str());
727                         }
728                         dists.clear();
729                 }
730                 
731                 return 0;
732         }
733         catch(exception& e) {
734                 m->errorOut(e, "SplitMatrix", "splitNames");
735                 exit(1);
736         }
737 }
738 //********************************************************************************************************************
739 int SplitMatrix::splitDistanceRAM(){
740         try {
741                 vector<set<string> > groups;
742                 vector<string> outputs;
743                 
744                 int numGroups = 0;
745
746                 ifstream dFile;
747                 m->openInputFile(distFile, dFile);
748
749                 while(dFile){
750                         string seqA, seqB;
751                         float dist;
752
753                         dFile >> seqA >> seqB >> dist;
754                         
755                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
756                                         
757                         if(dist < cutoff){
758                                 //cout << "in cutoff: " << dist << endl;
759                                 int groupIDA = -1;
760                                 int groupIDB = -1;
761                                 int groupID = -1;
762                                 
763                                 for(int i=0;i<numGroups;i++){
764                                         set<string>::iterator aIt = groups[i].find(seqA);
765                                         set<string>::iterator bIt = groups[i].find(seqB);
766                                         
767                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
768                                                 groups[i].insert(seqB);
769                                                 groupIDA = i;
770                                                 groupID = groupIDA;
771
772                                                 //cout << "in aIt: " << groupID << endl;
773         //                                      break;
774                                         }
775                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
776                                                 groups[i].insert(seqA);
777                                                 groupIDB = i;
778                                                 groupID = groupIDB;
779
780                                         //      cout << "in bIt: " << groupID << endl;
781         //                                      break;
782                                         }
783                                 
784                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
785                                                 if(groupIDA < groupIDB){
786                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
787                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
788                                                         groups[groupIDB].clear(); 
789                                                         groupID = groupIDA;
790                                                 }
791                                                 else{
792                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
793                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
794                                                         groups[groupIDA].clear();  
795                                                         groupID = groupIDB;
796                                                 }
797                                                 break;
798                                         }
799                                 }
800                                 
801         //windows is gonna gag on the reuse of outFile, will need to make it local...
802                                 
803                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
804                                         set<string> newGroup;
805                                         newGroup.insert(seqA);
806                                         newGroup.insert(seqB);
807                                         groups.push_back(newGroup);
808                                                                         
809                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
810                                         outputs.push_back(tempOut);
811                                         numGroups++;
812                                 }
813                                 else{
814                                                                                         
815                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
816                                         
817                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
818                                                 string row, column, distance;
819                                                 if(groupIDA<groupIDB){
820                                                         //merge memory
821                                                         outputs[groupID] += outputs[groupIDB];
822                                                         outputs[groupIDB] = "";
823                                                 }else{
824                                                         outputs[groupID] += outputs[groupIDA];
825                                                         outputs[groupIDA] = "";
826                                                 }                                       
827                                         }
828                                 }
829                         }
830                         m->gobble(dFile);
831                 }
832                 dFile.close();
833                 
834                 for (int i = 0; i < numGroups; i++) {
835                         if (outputs[i] != "") {
836                                 ofstream outFile;
837                                 string fileName = distFile + "." + toString(i) + ".temp";
838                                 outFile.open(fileName.c_str(), ios::ate);
839                                 outFile << outputs[i];
840                                 outFile.close();
841                         }
842                 }
843
844                 splitNames(groups);
845                                 
846                 return 0;                       
847         }
848         catch(exception& e) {
849                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
850                 exit(1);
851         }
852 }
853 //********************************************************************************************************************
854 //sorts biggest to smallest
855 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
856         
857         FILE * pFile;
858         long leftsize = 0;
859                 
860         //get num bytes in file
861         string filename = left.begin()->first;
862         pFile = fopen (filename.c_str(),"rb");
863         string error = "Error opening " + filename;
864         if (pFile==NULL) perror (error.c_str());
865         else{
866                 fseek (pFile, 0, SEEK_END);
867                 leftsize=ftell (pFile);
868                 fclose (pFile);
869         }
870
871         FILE * pFile2;
872         long rightsize = 0;
873                 
874         //get num bytes in file
875         filename = right.begin()->first;
876         pFile2 = fopen (filename.c_str(),"rb");
877         error = "Error opening " + filename;
878         if (pFile2==NULL) perror (error.c_str());
879         else{
880                 fseek (pFile2, 0, SEEK_END);
881                 rightsize=ftell (pFile2);
882                 fclose (pFile2);
883         }
884
885         return (leftsize > rightsize);  
886
887 /***********************************************************************/
888 //returns map of distance files -> namefile sorted by distance file size
889 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
890         try {   
891                 
892                 sort(dists.begin(), dists.end(), compareFileSizes);
893                 
894                 return dists;
895         }
896         catch(exception& e) {
897                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
898                 exit(1);
899         }
900 }
901 /***********************************************************************/
902 SplitMatrix::~SplitMatrix(){}
903 /***********************************************************************/
904