]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
cluster.split fix
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13
14 /***********************************************************************/
15
16 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
17         m = MothurOut::getInstance();
18         distFile = distfile;
19         cutoff = c;
20         namefile = name;
21         method = t;
22         taxFile = tax;
23         large = l;
24 }
25 /***********************************************************************/
26
27 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){
28         m = MothurOut::getInstance();
29         fastafile = ffile;
30         namefile = name;
31         taxFile = tax;
32         cutoff = c;  //tax level cutoff
33         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
34         method = t;
35         processors = p;
36         outputDir = output;
37 }
38
39 /***********************************************************************/
40
41 int SplitMatrix::split(){
42         try {
43         
44                 if (method == "distance") {  
45                         splitDistance();
46                 }else if ((method == "classify") || (method == "fasta")) {
47                         splitClassify();
48                 }else {
49                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
50                         map<string, string> temp;
51                         temp[distFile] = namefile;
52                         dists.push_back(temp);
53                 }
54                 
55                 return 0;
56         }
57         catch(exception& e) {
58                 m->errorOut(e, "SplitMatrix", "split");
59                 exit(1);
60         }
61 }
62 /***********************************************************************/
63 int SplitMatrix::splitDistance(){
64         try {
65         
66                 if (large)      { splitDistanceLarge(); }
67                 else            { splitDistanceRAM();   }
68                         
69         }
70         catch(exception& e) {
71                 m->errorOut(e, "SplitMatrix", "splitDistance");
72                 exit(1);
73         }
74 }
75
76 /***********************************************************************/
77 int SplitMatrix::splitClassify(){
78         try {
79                 cutoff = int(cutoff);
80                                 
81                 map<string, int> seqGroup;
82                 map<string, int>::iterator it;
83                 map<string, int>::iterator it2;
84                 
85                 int numGroups = 0;
86                 
87                 //build tree from users taxonomy file
88                 PhyloTree* phylo = new PhyloTree();
89                 
90                 ifstream in;
91                 m->openInputFile(taxFile, in);
92                         
93                 //read in users taxonomy file and add sequences to tree
94                 string seqname, tax;
95                 while(!in.eof()){
96                         in >> seqname >> tax; m->gobble(in);
97                         phylo->addSeqToTree(seqname, tax);
98                 }
99                 in.close();
100                 
101                 phylo->assignHeirarchyIDs(0);
102
103                 //make sure the cutoff is not greater than maxlevel
104                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
105         
106                 //for each node in tree
107                 for (int i = 0; i < phylo->getNumNodes(); i++) {
108                 
109                         //is this node within the cutoff
110                         TaxNode taxon = phylo->get(i);
111         
112                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
113                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
114                                         for (int j = 0; j < taxon.accessions.size(); j++) {
115                                                 seqGroup[taxon.accessions[j]] = numGroups;
116                                         }
117                                         numGroups++;
118                                 }
119                         }
120                 }
121         
122                 delete phylo;
123                 
124                 if (method == "classify") {
125                         splitDistanceFileByTax(seqGroup, numGroups);
126                 }else {
127                         createDistanceFilesFromTax(seqGroup, numGroups);
128                 }
129                 
130                 return 0;
131                         
132         }
133         catch(exception& e) {
134                 m->errorOut(e, "SplitMatrix", "splitClassify");
135                 exit(1);
136         }
137 }
138 /***********************************************************************/
139 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
140         try {
141                 map<string, int> copyGroups = seqGroup;
142                 map<string, int>::iterator it;
143                 set<string> names;
144                                 
145                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
146                         remove((fastafile + "." + toString(i) + ".temp").c_str());
147                 }
148                         
149                 ifstream in;
150                 m->openInputFile(fastafile, in);
151         
152                 //parse fastafile
153                 ofstream outFile;
154                 while (!in.eof()) {
155                         Sequence query(in); m->gobble(in);
156                         if (query.getName() != "") {
157                 
158                                 it = seqGroup.find(query.getName());
159                                 
160                                 //save names in case no namefile is given
161                                 if (namefile == "") {  names.insert(query.getName()); }
162                         
163                                 if (it != seqGroup.end()) { //not singleton 
164                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
165                                         query.printSequence(outFile); 
166                                         outFile.close();
167                                         
168                                         copyGroups.erase(query.getName());
169                                 }
170                         }
171                 }
172                 in.close();
173                 
174                 //warn about sequence in groups that are not in fasta file
175                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
176                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
177                         exit(1);
178                 }
179                 
180                 copyGroups.clear();
181                 
182                 //process each distance file
183                 for (int i = 0; i < numGroups; i++) { 
184                         
185                         string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff);
186                         
187                         Command* command = new DistanceCommand(options);
188                         command->execute();
189                         delete command;
190                         
191                         remove((fastafile + "." + toString(i) + ".temp").c_str());
192                         
193                         //remove old names files just in case
194                         remove((namefile + "." + toString(i) + ".temp").c_str());
195                 }
196                 
197                 singleton = namefile + ".extra.temp";
198                 ofstream remainingNames;
199                 m->openOutputFile(singleton, remainingNames);
200                 
201                 bool wroteExtra = false;
202
203                 ifstream bigNameFile;
204                 m->openInputFile(namefile, bigNameFile);
205                 
206                 string name, nameList;
207                 while(!bigNameFile.eof()){
208                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
209                         
210                         //did this sequence get assigned a group
211                         it = seqGroup.find(name);
212                         
213                         if (it != seqGroup.end()) {  
214                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
215                                 outFile << name << '\t' << nameList << endl;
216                                 outFile.close();
217                         }else{
218                                 wroteExtra = true;
219                                 remainingNames << name << '\t' << nameList << endl;
220                         }
221                 }
222                 bigNameFile.close();
223                 
224                 for(int i=0;i<numGroups;i++){
225                         string tempNameFile = namefile + "." + toString(i) + ".temp";
226                         if (outputDir == "") { outputDir = m->hasPath(fastafile); }
227                         string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
228
229                         //if there are valid distances
230                         ifstream fileHandle;
231                         fileHandle.open(tempDistFile.c_str());
232                         if(fileHandle)  {       
233                                 m->gobble(fileHandle);
234                                 if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
235                                         map<string, string> temp;
236                                         temp[tempDistFile] = tempNameFile;
237                                         dists.push_back(temp);
238                                 }else {
239                                         ifstream in;
240                                         m->openInputFile(tempNameFile, in);
241                                 
242                                         while(!in.eof()) { 
243                                                 in >> name >> nameList;  m->gobble(in);
244                                                 wroteExtra = true;
245                                                 remainingNames << name << '\t' << nameList << endl;
246                                         }
247                                         in.close();
248                                         remove(tempNameFile.c_str());
249                                 }
250                         }
251                         fileHandle.close();
252                 }
253                 
254                 remainingNames.close();
255                 if (!wroteExtra) { 
256                         remove(singleton.c_str());
257                         singleton = "none";
258                 }
259
260                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
261                 
262                 return 0;
263         }
264         catch(exception& e) {
265                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
266                 exit(1);
267         }
268 }
269 /***********************************************************************/
270 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
271         try {
272                 map<string, int>::iterator it;
273                 map<string, int>::iterator it2;
274                 
275                 ifstream dFile;
276                 m->openInputFile(distFile, dFile);
277                 ofstream outFile;
278                 
279                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
280                         remove((distFile + "." + toString(i) + ".temp").c_str());
281                 }
282                 
283                 //for buffering the io to improve speed
284                  //allow for 10 dists to be stored, then output.
285                 vector<string> outputs;  outputs.resize(numGroups, "");
286                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
287                 
288                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
289                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
290                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
291                 
292                 //for each distance
293                 while(dFile){
294                         string seqA, seqB;
295                         float dist;
296                         
297                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
298                         
299                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
300                         
301                         //if both sequences are in the same group then they are within the cutoff
302                         it = seqGroup.find(seqA);
303                         it2 = seqGroup.find(seqB);
304                         
305                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
306                                 if (it->second == it2->second) { //they are from the same group so add the distance
307                                         if (numOutputs[it->second] > 30) {
308                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
309                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
310                                                 outFile.close();
311                                                 outputs[it->second] = "";
312                                                 numOutputs[it->second] = 0;
313                                                 validDistances[it->second] = true;
314                                         }else{
315                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
316                                                 numOutputs[it->second]++;
317                                         }
318                                 }
319                         }
320                 }
321                 dFile.close();
322         
323                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
324                         remove((namefile + "." + toString(i) + ".temp").c_str());
325                         
326                         //write out any remaining buffers
327                         if (numOutputs[i] > 0) {
328                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
329                                 outFile << outputs[i];
330                                 outFile.close();
331                                 outputs[i] = "";
332                                 numOutputs[i] = 0;
333                                 validDistances[i] = true;
334                         }
335                 }
336                 
337                 ifstream bigNameFile;
338                 m->openInputFile(namefile, bigNameFile);
339                 
340                 singleton = namefile + ".extra.temp";
341                 ofstream remainingNames;
342                 m->openOutputFile(singleton, remainingNames);
343                 
344                 bool wroteExtra = false;
345                                                 
346                 string name, nameList;
347                 while(!bigNameFile.eof()){
348                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
349                         
350                         //did this sequence get assigned a group
351                         it = seqGroup.find(name);
352                         
353                         if (it != seqGroup.end()) {  
354                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
355                                 outFile << name << '\t' << nameList << endl;
356                                 outFile.close();
357                         }else{
358                                 wroteExtra = true;
359                                 remainingNames << name << '\t' << nameList << endl;
360                         }
361                 }
362                 bigNameFile.close();
363                                 
364                 for(int i=0;i<numGroups;i++){
365                         string tempNameFile = namefile + "." + toString(i) + ".temp";
366                         string tempDistFile = distFile + "." + toString(i) + ".temp";
367
368                         //if there are valid distances
369                         if (validDistances[i]) {
370                                 map<string, string> temp;
371                                 temp[tempDistFile] = tempNameFile;
372                                 dists.push_back(temp);
373                         }else{
374                                 ifstream in;
375                                 m->openInputFile(tempNameFile, in);
376                                 
377                                 while(!in.eof()) { 
378                                         in >> name >> nameList;  m->gobble(in);
379                                         wroteExtra = true;
380                                         remainingNames << name << '\t' << nameList << endl;
381                                 }
382                                 in.close();
383                                 remove(tempNameFile.c_str());
384                         }
385                 }
386                 
387                 remainingNames.close();
388                 
389                 if (!wroteExtra) { 
390                         remove(singleton.c_str());
391                         singleton = "none";
392                 }
393
394                 if (m->control_pressed)  {  
395                         for (int i = 0; i < dists.size(); i++) { 
396                                 remove((dists[i].begin()->first).c_str());
397                                 remove((dists[i].begin()->second).c_str());
398                         }
399                         dists.clear();
400                 }
401                 
402                 return 0;
403         }
404         catch(exception& e) {
405                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
406                 exit(1);
407         }
408 }
409 /***********************************************************************/
410 int SplitMatrix::splitDistanceLarge(){
411         try {
412                 vector<set<string> > groups;
413                 
414                 //for buffering the io to improve speed
415                  //allow for 30 dists to be stored, then output.
416                 vector<string> outputs;
417                 vector<int> numOutputs;
418                 vector<bool> wroteOutPut;
419                 
420                 int numGroups = 0;
421
422                 ofstream outFile;
423                 ifstream dFile;
424                 m->openInputFile(distFile, dFile);
425         
426                 while(dFile){
427                         string seqA, seqB;
428                         float dist;
429
430                         dFile >> seqA >> seqB >> dist;
431                         
432                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
433                                         
434                         if(dist < cutoff){
435                                 //cout << "in cutoff: " << dist << endl;
436                                 int groupIDA = -1;
437                                 int groupIDB = -1;
438                                 int groupID = -1;
439                                 
440                                 for(int i=0;i<numGroups;i++){
441                                         set<string>::iterator aIt = groups[i].find(seqA);
442                                         set<string>::iterator bIt = groups[i].find(seqB);
443                                         
444                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
445                                                 groups[i].insert(seqB);
446                                                 groupIDA = i;
447                                                 groupID = groupIDA;
448
449                                                 //cout << "in aIt: " << groupID << endl;
450         //                                      break;
451                                         }
452                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
453                                                 groups[i].insert(seqA);
454                                                 groupIDB = i;
455                                                 groupID = groupIDB;
456
457                                         //      cout << "in bIt: " << groupID << endl;
458         //                                      break;
459                                         }
460                                 
461                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
462                                                 if(groupIDA < groupIDB){
463                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
464                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
465                                                         groups[groupIDB].clear(); 
466                                                         groupID = groupIDA;
467                                                 }
468                                                 else{
469                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
470                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
471                                                         groups[groupIDA].clear();  
472                                                         groupID = groupIDB;
473                                                 }
474                                                 break;
475                                         }
476                                 }
477                                 
478         //windows is gonna gag on the reuse of outFile, will need to make it local...
479                                 
480                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
481                                         set<string> newGroup;
482                                         newGroup.insert(seqA);
483                                         newGroup.insert(seqB);
484                                         groups.push_back(newGroup);
485                                                                         
486                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
487                                         outputs.push_back(tempOut);
488                                         numOutputs.push_back(1);
489                                         wroteOutPut.push_back(false);
490                                         
491                                         numGroups++;
492                                 }
493                                 else{
494                                         string fileName = distFile + "." + toString(groupID) + ".temp";
495                                                                                         
496                                         //have we reached the max buffer size
497                                         if (numOutputs[groupID] > 60) { //write out sequence
498                                                 outFile.open(fileName.c_str(), ios::app);
499                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
500                                                 outFile.close();
501                                                 
502                                                 outputs[groupID] = "";
503                                                 numOutputs[groupID] = 0;
504                                                 wroteOutPut[groupID] = true;
505                                         }else {
506                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
507                                                 numOutputs[groupID]++;
508                                         }
509                                         
510                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
511                                                 string row, column, distance;
512                                                 if(groupIDA<groupIDB){
513                                                         
514                                                         //merge memory
515                                                         numOutputs[groupID] += numOutputs[groupIDB];
516                                                         outputs[groupID] += outputs[groupIDB];
517                                                         
518                                                         outputs[groupIDB] = "";
519                                                         numOutputs[groupIDB] = 0;
520                                                         
521                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
522                                                         if (wroteOutPut[groupIDB]) {
523                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
524                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
525                                                                 
526                                                                 outFile.open(fileName.c_str(), ios::app);
527                                                                 
528                                                                 long size;
529                                                                 char* memblock;
530
531                                                                 size = fileB.tellg();
532                                 
533                                                                 fileB.seekg (0, ios::beg);
534                                                                 
535                                                                 int numRead = size / 1024;
536                                                                 int lastRead = size % 1024;
537
538                                                                 for (int i = 0; i < numRead; i++) {
539                                 
540                                                                         memblock = new char [1024];
541                                                                 
542                                                                         fileB.read (memblock, 1024);
543                                                                         
544                                                                         string temp = memblock;
545                                                                         outFile << temp.substr(0, 1024);
546                                                                         
547                                                                         delete memblock;
548                                                                 }
549                                                                 
550                                                                 memblock = new char [lastRead];
551                                                                 
552                                                                 fileB.read (memblock, lastRead);
553                                                                 
554                                                                 //not sure why but it will read more than lastRead char...??
555                                                                 string temp = memblock;
556                                                                 outFile << temp.substr(0, lastRead);
557                                                                 delete memblock;
558                                                                 
559                                                                 fileB.close();
560                                                                 remove(fileName2.c_str());
561                                                                 
562                                                                 //write out the merged memory
563                                                                 if (numOutputs[groupID] > 60) {
564                                                                         outFile << outputs[groupID];
565                                                                         outputs[groupID] = "";
566                                                                         numOutputs[groupID] = 0;
567                                                                 }
568                                                                 
569                                                                 outFile.close();
570                                                                 
571                                                                 wroteOutPut[groupID] = true;
572                                                                 wroteOutPut[groupIDB] = false;
573                                                         }else{ } //just merge b's memory with a's memory 
574                                                 }
575                                                 else{
576                                                         numOutputs[groupID] += numOutputs[groupIDA];
577                                                         outputs[groupID] += outputs[groupIDA];
578                                                         
579                                                         outputs[groupIDA] = "";
580                                                         numOutputs[groupIDA] = 0;
581                                                         
582                                                         if (wroteOutPut[groupIDA]) {
583                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
584                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
585                                                                 
586                                                                 outFile.open(fileName.c_str(), ios::app);
587                                                                 
588                                                                 long size;
589                                                                 char* memblock;
590
591                                                                 size = fileB.tellg();
592                                                                                                                         
593                                                                 fileB.seekg (0, ios::beg);
594                                                                 
595                                                                 int numRead = size / 1024;
596                                                                 int lastRead = size % 1024;
597
598                                                                 for (int i = 0; i < numRead; i++) {
599                                 
600                                                                         memblock = new char [1024];
601                                                                 
602                                                                         fileB.read (memblock, 1024);
603                                                                         string temp = memblock;
604                                                                         outFile << temp.substr(0, 1024);
605                                                                         
606                                                                         delete memblock;
607                                                                 }
608                                                                 
609                                                                 memblock = new char [lastRead];
610                                                                 
611                                                                 fileB.read (memblock, lastRead);
612                                                                 
613                                                                 //not sure why but it will read more than lastRead char...??
614                                                                 string temp = memblock;
615                                                                 outFile << temp.substr(0, lastRead);
616                                                                         
617                                                                 delete memblock;
618                                                                 
619                                                                 fileB.close();
620                                                                 remove(fileName2.c_str());
621                                                                 
622                                                                 //write out the merged memory
623                                                                 if (numOutputs[groupID] > 60) {
624                                                                         outFile << outputs[groupID];
625                                                                         outputs[groupID] = "";
626                                                                         numOutputs[groupID] = 0;
627                                                                 }
628                                                                 
629                                                                 outFile.close();
630                                                                 
631                                                                 wroteOutPut[groupID] = true;
632                                                                 wroteOutPut[groupIDA] = false;
633                                                         }else { } //just merge memory
634                                                 }                                       
635                                         }
636                                 }
637                         }
638                         m->gobble(dFile);
639                 }
640                 dFile.close();
641                 
642                 for (int i = 0; i < numGroups; i++) {
643                         if (numOutputs[i] > 0) {
644                                 string fileName = distFile + "." + toString(i) + ".temp";
645                                 outFile.open(fileName.c_str(), ios::app);
646                                 outFile << outputs[i];
647                                 outFile.close();
648                         }
649                 }
650
651                 splitNames(groups);
652                                 
653                 return 0;                       
654         }
655         catch(exception& e) {
656                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
657                 exit(1);
658         }
659 }
660 //********************************************************************************************************************
661 int SplitMatrix::splitNames(vector<set<string> >& groups){
662         try {
663                 int numGroups = groups.size();
664         
665                 ifstream bigNameFile(namefile.c_str());
666                 if(!bigNameFile){
667                         cerr << "Error: We can't open the name file\n";
668                         exit(1);
669                 }
670                 
671                 map<string, string> nameMap;
672                 string name, nameList;
673                 while(bigNameFile){
674                         bigNameFile >> name >> nameList;
675                         nameMap[name] = nameList;
676                         m->gobble(bigNameFile);
677                 }
678                 bigNameFile.close();
679                         
680                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
681                         int numSeqsInGroup = groups[i].size();
682                         
683                         if(numSeqsInGroup > 0){
684                                 string fileName = namefile + "." + toString(i) + ".temp";
685                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
686                                 
687                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
688                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
689                                         if (nIt != nameMap.end()) {
690                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
691                                                 nameMap.erase(nIt);
692                                         }else{
693                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
694                                         }
695                                 }
696                                 smallNameFile.close();
697                         }
698                 }
699                 
700                 //names of singletons
701                 if (nameMap.size() != 0) {
702                         singleton = namefile + ".extra.temp";
703                         ofstream remainingNames(singleton.c_str(), ios::ate);
704                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
705                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
706                         }
707                         remainingNames.close();
708                 }else { singleton = "none"; }
709                         
710                 for(int i=0;i<numGroups;i++){
711                         if(groups[i].size() > 0){
712                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
713                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
714                                 
715                                 map<string, string> temp;
716                                 temp[tempDistFile] = tempNameFile;
717                                 dists.push_back(temp);
718                         }
719                 }
720                 
721                 if (m->control_pressed)  {  
722                         for (int i = 0; i < dists.size(); i++) { 
723                                 remove((dists[i].begin()->first).c_str());
724                                 remove((dists[i].begin()->second).c_str());
725                         }
726                         dists.clear();
727                 }
728                 
729                 return 0;
730         }
731         catch(exception& e) {
732                 m->errorOut(e, "SplitMatrix", "splitNames");
733                 exit(1);
734         }
735 }
736 //********************************************************************************************************************
737 int SplitMatrix::splitDistanceRAM(){
738         try {
739                 vector<set<string> > groups;
740                 vector<string> outputs;
741                 
742                 int numGroups = 0;
743
744                 ifstream dFile;
745                 m->openInputFile(distFile, dFile);
746
747                 while(dFile){
748                         string seqA, seqB;
749                         float dist;
750
751                         dFile >> seqA >> seqB >> dist;
752                         
753                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
754                                         
755                         if(dist < cutoff){
756                                 //cout << "in cutoff: " << dist << endl;
757                                 int groupIDA = -1;
758                                 int groupIDB = -1;
759                                 int groupID = -1;
760                                 
761                                 for(int i=0;i<numGroups;i++){
762                                         set<string>::iterator aIt = groups[i].find(seqA);
763                                         set<string>::iterator bIt = groups[i].find(seqB);
764                                         
765                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
766                                                 groups[i].insert(seqB);
767                                                 groupIDA = i;
768                                                 groupID = groupIDA;
769
770                                                 //cout << "in aIt: " << groupID << endl;
771         //                                      break;
772                                         }
773                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
774                                                 groups[i].insert(seqA);
775                                                 groupIDB = i;
776                                                 groupID = groupIDB;
777
778                                         //      cout << "in bIt: " << groupID << endl;
779         //                                      break;
780                                         }
781                                 
782                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
783                                                 if(groupIDA < groupIDB){
784                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
785                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
786                                                         groups[groupIDB].clear(); 
787                                                         groupID = groupIDA;
788                                                 }
789                                                 else{
790                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
791                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
792                                                         groups[groupIDA].clear();  
793                                                         groupID = groupIDB;
794                                                 }
795                                                 break;
796                                         }
797                                 }
798                                 
799         //windows is gonna gag on the reuse of outFile, will need to make it local...
800                                 
801                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
802                                         set<string> newGroup;
803                                         newGroup.insert(seqA);
804                                         newGroup.insert(seqB);
805                                         groups.push_back(newGroup);
806                                                                         
807                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
808                                         outputs.push_back(tempOut);
809                                         numGroups++;
810                                 }
811                                 else{
812                                                                                         
813                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
814                                         
815                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
816                                                 string row, column, distance;
817                                                 if(groupIDA<groupIDB){
818                                                         //merge memory
819                                                         outputs[groupID] += outputs[groupIDB];
820                                                         outputs[groupIDB] = "";
821                                                 }else{
822                                                         outputs[groupID] += outputs[groupIDA];
823                                                         outputs[groupIDA] = "";
824                                                 }                                       
825                                         }
826                                 }
827                         }
828                         m->gobble(dFile);
829                 }
830                 dFile.close();
831                 
832                 for (int i = 0; i < numGroups; i++) {
833                         if (outputs[i] != "") {
834                                 ofstream outFile;
835                                 string fileName = distFile + "." + toString(i) + ".temp";
836                                 outFile.open(fileName.c_str(), ios::ate);
837                                 outFile << outputs[i];
838                                 outFile.close();
839                         }
840                 }
841
842                 splitNames(groups);
843                                 
844                 return 0;                       
845         }
846         catch(exception& e) {
847                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
848                 exit(1);
849         }
850 }
851 //********************************************************************************************************************
852 //sorts biggest to smallest
853 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
854         
855         FILE * pFile;
856         long leftsize = 0;
857                 
858         //get num bytes in file
859         string filename = left.begin()->first;
860         pFile = fopen (filename.c_str(),"rb");
861         string error = "Error opening " + filename;
862         if (pFile==NULL) perror (error.c_str());
863         else{
864                 fseek (pFile, 0, SEEK_END);
865                 leftsize=ftell (pFile);
866                 fclose (pFile);
867         }
868
869         FILE * pFile2;
870         long rightsize = 0;
871                 
872         //get num bytes in file
873         filename = right.begin()->first;
874         pFile2 = fopen (filename.c_str(),"rb");
875         error = "Error opening " + filename;
876         if (pFile2==NULL) perror (error.c_str());
877         else{
878                 fseek (pFile2, 0, SEEK_END);
879                 rightsize=ftell (pFile2);
880                 fclose (pFile2);
881         }
882
883         return (leftsize > rightsize);  
884
885 /***********************************************************************/
886 //returns map of distance files -> namefile sorted by distance file size
887 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
888         try {   
889                 
890                 sort(dists.begin(), dists.end(), compareFileSizes);
891                 
892                 return dists;
893         }
894         catch(exception& e) {
895                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
896                 exit(1);
897         }
898 }
899 /***********************************************************************/
900 SplitMatrix::~SplitMatrix(){}
901 /***********************************************************************/
902