]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
fixed bug in cluster.split
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13
14 /***********************************************************************/
15
16 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
17         m = MothurOut::getInstance();
18         distFile = distfile;
19         cutoff = c;
20         namefile = name;
21         method = t;
22         taxFile = tax;
23         large = l;
24 }
25 /***********************************************************************/
26
27 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p){
28         m = MothurOut::getInstance();
29         fastafile = ffile;
30         namefile = name;
31         taxFile = tax;
32         cutoff = c;
33         method = t;
34         processors = p;
35 }
36
37 /***********************************************************************/
38
39 int SplitMatrix::split(){
40         try {
41         
42                 if (method == "distance") {  
43                         splitDistance();
44                 }else if ((method == "classify") || (method == "fasta")) {
45                         splitClassify();
46                 }else {
47                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
48                         map<string, string> temp;
49                         temp[distFile] = namefile;
50                         dists.push_back(temp);
51                 }
52                 
53                 return 0;
54         }
55         catch(exception& e) {
56                 m->errorOut(e, "SplitMatrix", "split");
57                 exit(1);
58         }
59 }
60 /***********************************************************************/
61 int SplitMatrix::splitDistance(){
62         try {
63         
64                 if (large)      { splitDistanceLarge(); }
65                 else            { splitDistanceRAM();   }
66                         
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "SplitMatrix", "splitDistance");
70                 exit(1);
71         }
72 }
73
74 /***********************************************************************/
75 int SplitMatrix::splitClassify(){
76         try {
77                 cutoff = int(cutoff);
78                 
79                 map<string, int> seqGroup;
80                 map<string, int>::iterator it;
81                 map<string, int>::iterator it2;
82                 
83                 int numGroups = 0;
84                 
85                 //build tree from users taxonomy file
86                 PhyloTree* phylo = new PhyloTree();
87                 
88                 ifstream in;
89                 openInputFile(taxFile, in);
90                         
91                 //read in users taxonomy file and add sequences to tree
92                 string seqname, tax;
93                 while(!in.eof()){
94                         in >> seqname >> tax; gobble(in);
95                         phylo->addSeqToTree(seqname, tax);
96                 }
97                 in.close();
98                 
99                 phylo->assignHeirarchyIDs(0);
100
101                 //make sure the cutoff is not greater than maxlevel
102                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
103         
104                 //for each node in tree
105                 for (int i = 0; i < phylo->getNumNodes(); i++) {
106                 
107                         //is this node within the cutoff
108                         TaxNode taxon = phylo->get(i);
109         
110                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
111                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
112                                         for (int j = 0; j < taxon.accessions.size(); j++) {
113                                                 seqGroup[taxon.accessions[j]] = numGroups;
114                                         }
115                                         numGroups++;
116                                 }
117                         }
118                 }
119         
120                 delete phylo;
121                 
122                 if (method == "classify") {
123                         splitDistanceFileByTax(seqGroup, numGroups);
124                 }else {
125                         createDistanceFilesFromTax(seqGroup, numGroups);
126                 }
127                 
128                 return 0;
129                         
130         }
131         catch(exception& e) {
132                 m->errorOut(e, "SplitMatrix", "splitClassify");
133                 exit(1);
134         }
135 }
136 /***********************************************************************/
137 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
138         try {
139                 map<string, int> copyGroups = seqGroup;
140                 map<string, int>::iterator it;
141                 set<string> names;
142                                 
143                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
144                         remove((fastafile + "." + toString(i) + ".temp").c_str());
145                 }
146                         
147                 ifstream in;
148                 openInputFile(fastafile, in);
149         
150                 //parse fastafile
151                 ofstream outFile;
152                 while (!in.eof()) {
153                         Sequence query(in); gobble(in);
154                         if (query.getName() != "") {
155                 
156                                 it = seqGroup.find(query.getName());
157                                 
158                                 //save names in case no namefile is given
159                                 if (namefile == "") {  names.insert(query.getName()); }
160                         
161                                 if (it != seqGroup.end()) { //not singleton 
162                                         openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
163                                         query.printSequence(outFile); 
164                                         outFile.close();
165                                         
166                                         copyGroups.erase(it);
167                                 }
168                         }
169                 }
170                 in.close();
171                 
172                 //warn about sequence in groups that are not in fasta file
173                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
174                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
175                         exit(1);
176                 }
177                 
178                 copyGroups.clear();
179                 
180                 //process each distance file
181                 for (int i = 0; i < numGroups; i++) { 
182                         
183                         string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors);
184                         
185                         Command* command = new DistanceCommand(options);
186                         command->execute();
187                         delete command;
188                         
189                         remove((fastafile + "." + toString(i) + ".temp").c_str());
190                         
191                         //remove old names files just in case
192                         remove((namefile + "." + toString(i) + ".temp").c_str());
193                 }
194                 
195                 singleton = namefile + ".extra.temp";
196                 ofstream remainingNames;
197                 openOutputFile(singleton, remainingNames);
198                 
199                 bool wroteExtra = false;
200
201                 ifstream bigNameFile;
202                 openInputFile(namefile, bigNameFile);
203                 
204                 string name, nameList;
205                 while(!bigNameFile.eof()){
206                         bigNameFile >> name >> nameList;  gobble(bigNameFile);
207                         
208                         //did this sequence get assigned a group
209                         it = seqGroup.find(name);
210                         
211                         if (it != seqGroup.end()) {  
212                                 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
213                                 outFile << name << '\t' << nameList << endl;
214                                 outFile.close();
215                         }else{
216                                 wroteExtra = true;
217                                 remainingNames << name << '\t' << nameList << endl;
218                         }
219                 }
220                 bigNameFile.close();
221                 
222                 remainingNames.close();
223                 if (!wroteExtra) { 
224                         remove(singleton.c_str());
225                         singleton = "none";
226                 }
227
228                 for(int i=0;i<numGroups;i++){
229                         string tempNameFile = namefile + "." + toString(i) + ".temp";
230                         string tempDistFile = getRootName(getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
231
232                         //if there are valid distances
233                         ifstream fileHandle;
234                         fileHandle.open(tempDistFile.c_str());
235                         if(fileHandle)  {       
236                                 gobble(fileHandle);
237                                 if (!fileHandle.eof()) {  //check for blank file
238                                         map<string, string> temp;
239                                         temp[tempDistFile] = tempNameFile;
240                                         dists.push_back(temp);
241                                 }
242                         }
243                         fileHandle.close();
244                 }
245                 
246                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
247                 
248                 return 0;
249         }
250         catch(exception& e) {
251                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
252                 exit(1);
253         }
254 }
255 /***********************************************************************/
256 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
257         try {
258                 map<string, int>::iterator it;
259                 map<string, int>::iterator it2;
260                 
261                 ifstream dFile;
262                 openInputFile(distFile, dFile);
263                 ofstream outFile;
264                 
265                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
266                         remove((distFile + "." + toString(i) + ".temp").c_str());
267                 }
268                 
269                 //for buffering the io to improve speed
270                  //allow for 10 dists to be stored, then output.
271                 vector<string> outputs;  outputs.resize(numGroups, "");
272                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
273                 
274                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
275                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
276                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
277                 
278                 //for each distance
279                 while(dFile){
280                         string seqA, seqB;
281                         float dist;
282                         
283                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
284                         
285                         dFile >> seqA >> seqB >> dist;  gobble(dFile);
286                         
287                         //if both sequences are in the same group then they are within the cutoff
288                         it = seqGroup.find(seqA);
289                         it2 = seqGroup.find(seqB);
290                         
291                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
292                                 if (it->second == it2->second) { //they are from the same group so add the distance
293                                         if (numOutputs[it->second] > 30) {
294                                                 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
295                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
296                                                 outFile.close();
297                                                 outputs[it->second] = "";
298                                                 numOutputs[it->second] = 0;
299                                                 validDistances[it->second] = true;
300                                         }else{
301                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
302                                                 numOutputs[it->second]++;
303                                         }
304                                 }
305                         }
306                 }
307                 dFile.close();
308         
309                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
310                         remove((namefile + "." + toString(i) + ".temp").c_str());
311                         
312                         //write out any remaining buffers
313                         if (numOutputs[i] > 0) {
314                                 openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
315                                 outFile << outputs[i];
316                                 outFile.close();
317                                 outputs[i] = "";
318                                 numOutputs[i] = 0;
319                                 validDistances[i] = true;
320                         }
321                 }
322                 
323                 ifstream bigNameFile;
324                 openInputFile(namefile, bigNameFile);
325                 
326                 singleton = namefile + ".extra.temp";
327                 ofstream remainingNames;
328                 openOutputFile(singleton, remainingNames);
329                 
330                 bool wroteExtra = false;
331                                                 
332                 string name, nameList;
333                 while(!bigNameFile.eof()){
334                         bigNameFile >> name >> nameList;  gobble(bigNameFile);
335                         
336                         //did this sequence get assigned a group
337                         it = seqGroup.find(name);
338                         
339                         if (it != seqGroup.end()) {  
340                                 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
341                                 outFile << name << '\t' << nameList << endl;
342                                 outFile.close();
343                         }else{
344                                 wroteExtra = true;
345                                 remainingNames << name << '\t' << nameList << endl;
346                         }
347                 }
348                 bigNameFile.close();
349                                 
350                 for(int i=0;i<numGroups;i++){
351                         string tempNameFile = namefile + "." + toString(i) + ".temp";
352                         string tempDistFile = distFile + "." + toString(i) + ".temp";
353
354                         //if there are valid distances
355                         if (validDistances[i]) {
356                                 map<string, string> temp;
357                                 temp[tempDistFile] = tempNameFile;
358                                 dists.push_back(temp);
359                         }else{
360                                 ifstream in;
361                                 openInputFile(tempNameFile, in);
362                                 
363                                 while(!in.eof()) { 
364                                         in >> name >> nameList;  gobble(in);
365                                         wroteExtra = true;
366                                         remainingNames << name << '\t' << nameList << endl;
367                                 }
368                                 in.close();
369                                 remove(tempNameFile.c_str());
370                         }
371                 }
372                 
373                 remainingNames.close();
374                 
375                 if (!wroteExtra) { 
376                         remove(singleton.c_str());
377                         singleton = "none";
378                 }
379
380                 if (m->control_pressed)  {  
381                         for (int i = 0; i < dists.size(); i++) { 
382                                 remove((dists[i].begin()->first).c_str());
383                                 remove((dists[i].begin()->second).c_str());
384                         }
385                         dists.clear();
386                 }
387                 
388                 return 0;
389         }
390         catch(exception& e) {
391                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
392                 exit(1);
393         }
394 }
395 /***********************************************************************/
396 int SplitMatrix::splitDistanceLarge(){
397         try {
398                 vector<set<string> > groups;
399                 
400                 //for buffering the io to improve speed
401                  //allow for 30 dists to be stored, then output.
402                 vector<string> outputs;
403                 vector<int> numOutputs;
404                 vector<bool> wroteOutPut;
405                 
406                 int numGroups = 0;
407
408                 ofstream outFile;
409                 ifstream dFile;
410                 openInputFile(distFile, dFile);
411         
412                 while(dFile){
413                         string seqA, seqB;
414                         float dist;
415
416                         dFile >> seqA >> seqB >> dist;
417                         
418                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
419                                         
420                         if(dist < cutoff){
421                                 //cout << "in cutoff: " << dist << endl;
422                                 int groupIDA = -1;
423                                 int groupIDB = -1;
424                                 int groupID = -1;
425                                 
426                                 for(int i=0;i<numGroups;i++){
427                                         set<string>::iterator aIt = groups[i].find(seqA);
428                                         set<string>::iterator bIt = groups[i].find(seqB);
429                                         
430                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
431                                                 groups[i].insert(seqB);
432                                                 groupIDA = i;
433                                                 groupID = groupIDA;
434
435                                                 //cout << "in aIt: " << groupID << endl;
436         //                                      break;
437                                         }
438                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
439                                                 groups[i].insert(seqA);
440                                                 groupIDB = i;
441                                                 groupID = groupIDB;
442
443                                         //      cout << "in bIt: " << groupID << endl;
444         //                                      break;
445                                         }
446                                 
447                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
448                                                 if(groupIDA < groupIDB){
449                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
450                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
451                                                         groups[groupIDB].clear(); 
452                                                         groupID = groupIDA;
453                                                 }
454                                                 else{
455                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
456                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
457                                                         groups[groupIDA].clear();  
458                                                         groupID = groupIDB;
459                                                 }
460                                                 break;
461                                         }
462                                 }
463                                 
464         //windows is gonna gag on the reuse of outFile, will need to make it local...
465                                 
466                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
467                                         set<string> newGroup;
468                                         newGroup.insert(seqA);
469                                         newGroup.insert(seqB);
470                                         groups.push_back(newGroup);
471                                                                         
472                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
473                                         outputs.push_back(tempOut);
474                                         numOutputs.push_back(1);
475                                         wroteOutPut.push_back(false);
476                                         
477                                         numGroups++;
478                                 }
479                                 else{
480                                         string fileName = distFile + "." + toString(groupID) + ".temp";
481                                                                                         
482                                         //have we reached the max buffer size
483                                         if (numOutputs[groupID] > 60) { //write out sequence
484                                                 outFile.open(fileName.c_str(), ios::app);
485                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
486                                                 outFile.close();
487                                                 
488                                                 outputs[groupID] = "";
489                                                 numOutputs[groupID] = 0;
490                                                 wroteOutPut[groupID] = true;
491                                         }else {
492                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
493                                                 numOutputs[groupID]++;
494                                         }
495                                         
496                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
497                                                 string row, column, distance;
498                                                 if(groupIDA<groupIDB){
499                                                         
500                                                         //merge memory
501                                                         numOutputs[groupID] += numOutputs[groupIDB];
502                                                         outputs[groupID] += outputs[groupIDB];
503                                                         
504                                                         outputs[groupIDB] = "";
505                                                         numOutputs[groupIDB] = 0;
506                                                         
507                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
508                                                         if (wroteOutPut[groupIDB]) {
509                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
510                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
511                                                                 
512                                                                 outFile.open(fileName.c_str(), ios::app);
513                                                                 
514                                                                 long size;
515                                                                 char* memblock;
516
517                                                                 size = fileB.tellg();
518                                 
519                                                                 fileB.seekg (0, ios::beg);
520                                                                 
521                                                                 int numRead = size / 1024;
522                                                                 int lastRead = size % 1024;
523
524                                                                 for (int i = 0; i < numRead; i++) {
525                                 
526                                                                         memblock = new char [1024];
527                                                                 
528                                                                         fileB.read (memblock, 1024);
529                                                                         
530                                                                         string temp = memblock;
531                                                                         outFile << temp.substr(0, 1024);
532                                                                         
533                                                                         delete memblock;
534                                                                 }
535                                                                 
536                                                                 memblock = new char [lastRead];
537                                                                 
538                                                                 fileB.read (memblock, lastRead);
539                                                                 
540                                                                 //not sure why but it will read more than lastRead char...??
541                                                                 string temp = memblock;
542                                                                 outFile << temp.substr(0, lastRead);
543                                                                 delete memblock;
544                                                                 
545                                                                 fileB.close();
546                                                                 remove(fileName2.c_str());
547                                                                 
548                                                                 //write out the merged memory
549                                                                 if (numOutputs[groupID] > 60) {
550                                                                         outFile << outputs[groupID];
551                                                                         outputs[groupID] = "";
552                                                                         numOutputs[groupID] = 0;
553                                                                 }
554                                                                 
555                                                                 outFile.close();
556                                                                 
557                                                                 wroteOutPut[groupID] = true;
558                                                                 wroteOutPut[groupIDB] = false;
559                                                         }else{ } //just merge b's memory with a's memory 
560                                                 }
561                                                 else{
562                                                         numOutputs[groupID] += numOutputs[groupIDA];
563                                                         outputs[groupID] += outputs[groupIDA];
564                                                         
565                                                         outputs[groupIDA] = "";
566                                                         numOutputs[groupIDA] = 0;
567                                                         
568                                                         if (wroteOutPut[groupIDA]) {
569                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
570                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
571                                                                 
572                                                                 outFile.open(fileName.c_str(), ios::app);
573                                                                 
574                                                                 long size;
575                                                                 char* memblock;
576
577                                                                 size = fileB.tellg();
578                                                                                                                         
579                                                                 fileB.seekg (0, ios::beg);
580                                                                 
581                                                                 int numRead = size / 1024;
582                                                                 int lastRead = size % 1024;
583
584                                                                 for (int i = 0; i < numRead; i++) {
585                                 
586                                                                         memblock = new char [1024];
587                                                                 
588                                                                         fileB.read (memblock, 1024);
589                                                                         string temp = memblock;
590                                                                         outFile << temp.substr(0, 1024);
591                                                                         
592                                                                         delete memblock;
593                                                                 }
594                                                                 
595                                                                 memblock = new char [lastRead];
596                                                                 
597                                                                 fileB.read (memblock, lastRead);
598                                                                 
599                                                                 //not sure why but it will read more than lastRead char...??
600                                                                 string temp = memblock;
601                                                                 outFile << temp.substr(0, lastRead);
602                                                                         
603                                                                 delete memblock;
604                                                                 
605                                                                 fileB.close();
606                                                                 remove(fileName2.c_str());
607                                                                 
608                                                                 //write out the merged memory
609                                                                 if (numOutputs[groupID] > 60) {
610                                                                         outFile << outputs[groupID];
611                                                                         outputs[groupID] = "";
612                                                                         numOutputs[groupID] = 0;
613                                                                 }
614                                                                 
615                                                                 outFile.close();
616                                                                 
617                                                                 wroteOutPut[groupID] = true;
618                                                                 wroteOutPut[groupIDA] = false;
619                                                         }else { } //just merge memory
620                                                 }                                       
621                                         }
622                                 }
623                         }
624                         gobble(dFile);
625                 }
626                 dFile.close();
627                 
628                 for (int i = 0; i < numGroups; i++) {
629                         if (numOutputs[i] > 0) {
630                                 string fileName = distFile + "." + toString(i) + ".temp";
631                                 outFile.open(fileName.c_str(), ios::app);
632                                 outFile << outputs[i];
633                                 outFile.close();
634                         }
635                 }
636
637                 splitNames(groups);
638                                 
639                 return 0;                       
640         }
641         catch(exception& e) {
642                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
643                 exit(1);
644         }
645 }
646 //********************************************************************************************************************
647 int SplitMatrix::splitNames(vector<set<string> >& groups){
648         try {
649                 int numGroups = groups.size();
650         
651                 ifstream bigNameFile(namefile.c_str());
652                 if(!bigNameFile){
653                         cerr << "Error: We can't open the name file\n";
654                         exit(1);
655                 }
656                 
657                 map<string, string> nameMap;
658                 string name, nameList;
659                 while(bigNameFile){
660                         bigNameFile >> name >> nameList;
661                         nameMap[name] = nameList;
662                         gobble(bigNameFile);
663                 }
664                 bigNameFile.close();
665                         
666                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
667                         int numSeqsInGroup = groups[i].size();
668                         
669                         if(numSeqsInGroup > 0){
670                                 string fileName = namefile + "." + toString(i) + ".temp";
671                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
672                                 
673                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
674                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
675                                         if (nIt != nameMap.end()) {
676                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
677                                                 nameMap.erase(nIt);
678                                         }else{
679                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
680                                         }
681                                 }
682                                 smallNameFile.close();
683                         }
684                 }
685                 
686                 //names of singletons
687                 if (nameMap.size() != 0) {
688                         singleton = namefile + ".extra.temp";
689                         ofstream remainingNames(singleton.c_str(), ios::ate);
690                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
691                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
692                         }
693                         remainingNames.close();
694                 }else { singleton = "none"; }
695                         
696                 for(int i=0;i<numGroups;i++){
697                         if(groups[i].size() > 0){
698                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
699                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
700                                 
701                                 map<string, string> temp;
702                                 temp[tempDistFile] = tempNameFile;
703                                 dists.push_back(temp);
704                         }
705                 }
706                 
707                 if (m->control_pressed)  {  
708                         for (int i = 0; i < dists.size(); i++) { 
709                                 remove((dists[i].begin()->first).c_str());
710                                 remove((dists[i].begin()->second).c_str());
711                         }
712                         dists.clear();
713                 }
714                 
715                 return 0;
716         }
717         catch(exception& e) {
718                 m->errorOut(e, "SplitMatrix", "splitNames");
719                 exit(1);
720         }
721 }
722 //********************************************************************************************************************
723 int SplitMatrix::splitDistanceRAM(){
724         try {
725                 vector<set<string> > groups;
726                 vector<string> outputs;
727                 
728                 int numGroups = 0;
729
730                 ifstream dFile;
731                 openInputFile(distFile, dFile);
732
733                 while(dFile){
734                         string seqA, seqB;
735                         float dist;
736
737                         dFile >> seqA >> seqB >> dist;
738                         
739                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
740                                         
741                         if(dist < cutoff){
742                                 //cout << "in cutoff: " << dist << endl;
743                                 int groupIDA = -1;
744                                 int groupIDB = -1;
745                                 int groupID = -1;
746                                 
747                                 for(int i=0;i<numGroups;i++){
748                                         set<string>::iterator aIt = groups[i].find(seqA);
749                                         set<string>::iterator bIt = groups[i].find(seqB);
750                                         
751                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
752                                                 groups[i].insert(seqB);
753                                                 groupIDA = i;
754                                                 groupID = groupIDA;
755
756                                                 //cout << "in aIt: " << groupID << endl;
757         //                                      break;
758                                         }
759                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
760                                                 groups[i].insert(seqA);
761                                                 groupIDB = i;
762                                                 groupID = groupIDB;
763
764                                         //      cout << "in bIt: " << groupID << endl;
765         //                                      break;
766                                         }
767                                 
768                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
769                                                 if(groupIDA < groupIDB){
770                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
771                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
772                                                         groups[groupIDB].clear(); 
773                                                         groupID = groupIDA;
774                                                 }
775                                                 else{
776                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
777                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
778                                                         groups[groupIDA].clear();  
779                                                         groupID = groupIDB;
780                                                 }
781                                                 break;
782                                         }
783                                 }
784                                 
785         //windows is gonna gag on the reuse of outFile, will need to make it local...
786                                 
787                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
788                                         set<string> newGroup;
789                                         newGroup.insert(seqA);
790                                         newGroup.insert(seqB);
791                                         groups.push_back(newGroup);
792                                                                         
793                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
794                                         outputs.push_back(tempOut);
795                                         numGroups++;
796                                 }
797                                 else{
798                                                                                         
799                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
800                                         
801                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
802                                                 string row, column, distance;
803                                                 if(groupIDA<groupIDB){
804                                                         //merge memory
805                                                         outputs[groupID] += outputs[groupIDB];
806                                                         outputs[groupIDB] = "";
807                                                 }else{
808                                                         outputs[groupID] += outputs[groupIDA];
809                                                         outputs[groupIDA] = "";
810                                                 }                                       
811                                         }
812                                 }
813                         }
814                         gobble(dFile);
815                 }
816                 dFile.close();
817                 
818                 for (int i = 0; i < numGroups; i++) {
819                         if (outputs[i] != "") {
820                                 ofstream outFile;
821                                 string fileName = distFile + "." + toString(i) + ".temp";
822                                 outFile.open(fileName.c_str(), ios::ate);
823                                 outFile << outputs[i];
824                                 outFile.close();
825                         }
826                 }
827
828                 splitNames(groups);
829                                 
830                 return 0;                       
831         }
832         catch(exception& e) {
833                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
834                 exit(1);
835         }
836 }
837 //********************************************************************************************************************
838 //sorts biggest to smallest
839 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
840         
841         FILE * pFile;
842         long leftsize = 0;
843                 
844         //get num bytes in file
845         string filename = left.begin()->first;
846         pFile = fopen (filename.c_str(),"rb");
847         string error = "Error opening " + filename;
848         if (pFile==NULL) perror (error.c_str());
849         else{
850                 fseek (pFile, 0, SEEK_END);
851                 leftsize=ftell (pFile);
852                 fclose (pFile);
853         }
854
855         FILE * pFile2;
856         long rightsize = 0;
857                 
858         //get num bytes in file
859         filename = right.begin()->first;
860         pFile2 = fopen (filename.c_str(),"rb");
861         error = "Error opening " + filename;
862         if (pFile2==NULL) perror (error.c_str());
863         else{
864                 fseek (pFile2, 0, SEEK_END);
865                 rightsize=ftell (pFile2);
866                 fclose (pFile2);
867         }
868
869         return (leftsize > rightsize);  
870
871 /***********************************************************************/
872 //returns map of distance files -> namefile sorted by distance file size
873 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
874         try {   
875                 
876                 sort(dists.begin(), dists.end(), compareFileSizes);
877                 
878                 return dists;
879         }
880         catch(exception& e) {
881                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
882                 exit(1);
883         }
884 }
885 /***********************************************************************/
886 SplitMatrix::~SplitMatrix(){}
887 /***********************************************************************/
888