]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
added distance option to summary.shared
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13
14 /***********************************************************************/
15
16 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
17         m = MothurOut::getInstance();
18         distFile = distfile;
19         cutoff = c;
20         namefile = name;
21         method = t;
22         taxFile = tax;
23         large = l;
24 }
25 /***********************************************************************/
26
27 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p, string output){
28         m = MothurOut::getInstance();
29         fastafile = ffile;
30         namefile = name;
31         taxFile = tax;
32         cutoff = c;
33         method = t;
34         processors = p;
35         outputDir = output;
36 }
37
38 /***********************************************************************/
39
40 int SplitMatrix::split(){
41         try {
42         
43                 if (method == "distance") {  
44                         splitDistance();
45                 }else if ((method == "classify") || (method == "fasta")) {
46                         splitClassify();
47                 }else {
48                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
49                         map<string, string> temp;
50                         temp[distFile] = namefile;
51                         dists.push_back(temp);
52                 }
53                 
54                 return 0;
55         }
56         catch(exception& e) {
57                 m->errorOut(e, "SplitMatrix", "split");
58                 exit(1);
59         }
60 }
61 /***********************************************************************/
62 int SplitMatrix::splitDistance(){
63         try {
64         
65                 if (large)      { splitDistanceLarge(); }
66                 else            { splitDistanceRAM();   }
67                         
68         }
69         catch(exception& e) {
70                 m->errorOut(e, "SplitMatrix", "splitDistance");
71                 exit(1);
72         }
73 }
74
75 /***********************************************************************/
76 int SplitMatrix::splitClassify(){
77         try {
78                 cutoff = int(cutoff);
79                 
80                 map<string, int> seqGroup;
81                 map<string, int>::iterator it;
82                 map<string, int>::iterator it2;
83                 
84                 int numGroups = 0;
85                 
86                 //build tree from users taxonomy file
87                 PhyloTree* phylo = new PhyloTree();
88                 
89                 ifstream in;
90                 m->openInputFile(taxFile, in);
91                         
92                 //read in users taxonomy file and add sequences to tree
93                 string seqname, tax;
94                 while(!in.eof()){
95                         in >> seqname >> tax; m->gobble(in);
96                         phylo->addSeqToTree(seqname, tax);
97                 }
98                 in.close();
99                 
100                 phylo->assignHeirarchyIDs(0);
101
102                 //make sure the cutoff is not greater than maxlevel
103                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
104         
105                 //for each node in tree
106                 for (int i = 0; i < phylo->getNumNodes(); i++) {
107                 
108                         //is this node within the cutoff
109                         TaxNode taxon = phylo->get(i);
110         
111                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
112                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
113                                         for (int j = 0; j < taxon.accessions.size(); j++) {
114                                                 seqGroup[taxon.accessions[j]] = numGroups;
115                                         }
116                                         numGroups++;
117                                 }
118                         }
119                 }
120         
121                 delete phylo;
122                 
123                 if (method == "classify") {
124                         splitDistanceFileByTax(seqGroup, numGroups);
125                 }else {
126                         createDistanceFilesFromTax(seqGroup, numGroups);
127                 }
128                 
129                 return 0;
130                         
131         }
132         catch(exception& e) {
133                 m->errorOut(e, "SplitMatrix", "splitClassify");
134                 exit(1);
135         }
136 }
137 /***********************************************************************/
138 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
139         try {
140                 map<string, int> copyGroups = seqGroup;
141                 map<string, int>::iterator it;
142                 set<string> names;
143                                 
144                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
145                         remove((fastafile + "." + toString(i) + ".temp").c_str());
146                 }
147                         
148                 ifstream in;
149                 m->openInputFile(fastafile, in);
150         
151                 //parse fastafile
152                 ofstream outFile;
153                 while (!in.eof()) {
154                         Sequence query(in); m->gobble(in);
155                         if (query.getName() != "") {
156                 
157                                 it = seqGroup.find(query.getName());
158                                 
159                                 //save names in case no namefile is given
160                                 if (namefile == "") {  names.insert(query.getName()); }
161                         
162                                 if (it != seqGroup.end()) { //not singleton 
163                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
164                                         query.printSequence(outFile); 
165                                         outFile.close();
166                                         
167                                         copyGroups.erase(query.getName());
168                                 }
169                         }
170                 }
171                 in.close();
172                 
173                 //warn about sequence in groups that are not in fasta file
174                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
175                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
176                         exit(1);
177                 }
178                 
179                 copyGroups.clear();
180                 
181                 //process each distance file
182                 for (int i = 0; i < numGroups; i++) { 
183                         
184                         string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(cutoff);
185                         
186                         Command* command = new DistanceCommand(options);
187                         command->execute();
188                         delete command;
189                         
190                         remove((fastafile + "." + toString(i) + ".temp").c_str());
191                         
192                         //remove old names files just in case
193                         remove((namefile + "." + toString(i) + ".temp").c_str());
194                 }
195                 
196                 singleton = namefile + ".extra.temp";
197                 ofstream remainingNames;
198                 m->openOutputFile(singleton, remainingNames);
199                 
200                 bool wroteExtra = false;
201
202                 ifstream bigNameFile;
203                 m->openInputFile(namefile, bigNameFile);
204                 
205                 string name, nameList;
206                 while(!bigNameFile.eof()){
207                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
208                         
209                         //did this sequence get assigned a group
210                         it = seqGroup.find(name);
211                         
212                         if (it != seqGroup.end()) {  
213                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
214                                 outFile << name << '\t' << nameList << endl;
215                                 outFile.close();
216                         }else{
217                                 wroteExtra = true;
218                                 remainingNames << name << '\t' << nameList << endl;
219                         }
220                 }
221                 bigNameFile.close();
222                 
223                 for(int i=0;i<numGroups;i++){
224                         string tempNameFile = namefile + "." + toString(i) + ".temp";
225                         if (outputDir == "") { outputDir = m->hasPath(fastafile); }
226                         string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
227
228                         //if there are valid distances
229                         ifstream fileHandle;
230                         fileHandle.open(tempDistFile.c_str());
231                         if(fileHandle)  {       
232                                 m->gobble(fileHandle);
233                                 if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
234                                         map<string, string> temp;
235                                         temp[tempDistFile] = tempNameFile;
236                                         dists.push_back(temp);
237                                 }else {
238                                         ifstream in;
239                                         m->openInputFile(tempNameFile, in);
240                                 
241                                         while(!in.eof()) { 
242                                                 in >> name >> nameList;  m->gobble(in);
243                                                 wroteExtra = true;
244                                                 remainingNames << name << '\t' << nameList << endl;
245                                         }
246                                         in.close();
247                                         remove(tempNameFile.c_str());
248                                 }
249                         }
250                         fileHandle.close();
251                 }
252                 
253                 remainingNames.close();
254                 if (!wroteExtra) { 
255                         remove(singleton.c_str());
256                         singleton = "none";
257                 }
258
259                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
260                 
261                 return 0;
262         }
263         catch(exception& e) {
264                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
265                 exit(1);
266         }
267 }
268 /***********************************************************************/
269 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
270         try {
271                 map<string, int>::iterator it;
272                 map<string, int>::iterator it2;
273                 
274                 ifstream dFile;
275                 m->openInputFile(distFile, dFile);
276                 ofstream outFile;
277                 
278                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
279                         remove((distFile + "." + toString(i) + ".temp").c_str());
280                 }
281                 
282                 //for buffering the io to improve speed
283                  //allow for 10 dists to be stored, then output.
284                 vector<string> outputs;  outputs.resize(numGroups, "");
285                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
286                 
287                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
288                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
289                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
290                 
291                 //for each distance
292                 while(dFile){
293                         string seqA, seqB;
294                         float dist;
295                         
296                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
297                         
298                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
299                         
300                         //if both sequences are in the same group then they are within the cutoff
301                         it = seqGroup.find(seqA);
302                         it2 = seqGroup.find(seqB);
303                         
304                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
305                                 if (it->second == it2->second) { //they are from the same group so add the distance
306                                         if (numOutputs[it->second] > 30) {
307                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
308                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
309                                                 outFile.close();
310                                                 outputs[it->second] = "";
311                                                 numOutputs[it->second] = 0;
312                                                 validDistances[it->second] = true;
313                                         }else{
314                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
315                                                 numOutputs[it->second]++;
316                                         }
317                                 }
318                         }
319                 }
320                 dFile.close();
321         
322                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
323                         remove((namefile + "." + toString(i) + ".temp").c_str());
324                         
325                         //write out any remaining buffers
326                         if (numOutputs[i] > 0) {
327                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
328                                 outFile << outputs[i];
329                                 outFile.close();
330                                 outputs[i] = "";
331                                 numOutputs[i] = 0;
332                                 validDistances[i] = true;
333                         }
334                 }
335                 
336                 ifstream bigNameFile;
337                 m->openInputFile(namefile, bigNameFile);
338                 
339                 singleton = namefile + ".extra.temp";
340                 ofstream remainingNames;
341                 m->openOutputFile(singleton, remainingNames);
342                 
343                 bool wroteExtra = false;
344                                                 
345                 string name, nameList;
346                 while(!bigNameFile.eof()){
347                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
348                         
349                         //did this sequence get assigned a group
350                         it = seqGroup.find(name);
351                         
352                         if (it != seqGroup.end()) {  
353                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
354                                 outFile << name << '\t' << nameList << endl;
355                                 outFile.close();
356                         }else{
357                                 wroteExtra = true;
358                                 remainingNames << name << '\t' << nameList << endl;
359                         }
360                 }
361                 bigNameFile.close();
362                                 
363                 for(int i=0;i<numGroups;i++){
364                         string tempNameFile = namefile + "." + toString(i) + ".temp";
365                         string tempDistFile = distFile + "." + toString(i) + ".temp";
366
367                         //if there are valid distances
368                         if (validDistances[i]) {
369                                 map<string, string> temp;
370                                 temp[tempDistFile] = tempNameFile;
371                                 dists.push_back(temp);
372                         }else{
373                                 ifstream in;
374                                 m->openInputFile(tempNameFile, in);
375                                 
376                                 while(!in.eof()) { 
377                                         in >> name >> nameList;  m->gobble(in);
378                                         wroteExtra = true;
379                                         remainingNames << name << '\t' << nameList << endl;
380                                 }
381                                 in.close();
382                                 remove(tempNameFile.c_str());
383                         }
384                 }
385                 
386                 remainingNames.close();
387                 
388                 if (!wroteExtra) { 
389                         remove(singleton.c_str());
390                         singleton = "none";
391                 }
392
393                 if (m->control_pressed)  {  
394                         for (int i = 0; i < dists.size(); i++) { 
395                                 remove((dists[i].begin()->first).c_str());
396                                 remove((dists[i].begin()->second).c_str());
397                         }
398                         dists.clear();
399                 }
400                 
401                 return 0;
402         }
403         catch(exception& e) {
404                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
405                 exit(1);
406         }
407 }
408 /***********************************************************************/
409 int SplitMatrix::splitDistanceLarge(){
410         try {
411                 vector<set<string> > groups;
412                 
413                 //for buffering the io to improve speed
414                  //allow for 30 dists to be stored, then output.
415                 vector<string> outputs;
416                 vector<int> numOutputs;
417                 vector<bool> wroteOutPut;
418                 
419                 int numGroups = 0;
420
421                 ofstream outFile;
422                 ifstream dFile;
423                 m->openInputFile(distFile, dFile);
424         
425                 while(dFile){
426                         string seqA, seqB;
427                         float dist;
428
429                         dFile >> seqA >> seqB >> dist;
430                         
431                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
432                                         
433                         if(dist < cutoff){
434                                 //cout << "in cutoff: " << dist << endl;
435                                 int groupIDA = -1;
436                                 int groupIDB = -1;
437                                 int groupID = -1;
438                                 
439                                 for(int i=0;i<numGroups;i++){
440                                         set<string>::iterator aIt = groups[i].find(seqA);
441                                         set<string>::iterator bIt = groups[i].find(seqB);
442                                         
443                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
444                                                 groups[i].insert(seqB);
445                                                 groupIDA = i;
446                                                 groupID = groupIDA;
447
448                                                 //cout << "in aIt: " << groupID << endl;
449         //                                      break;
450                                         }
451                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
452                                                 groups[i].insert(seqA);
453                                                 groupIDB = i;
454                                                 groupID = groupIDB;
455
456                                         //      cout << "in bIt: " << groupID << endl;
457         //                                      break;
458                                         }
459                                 
460                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
461                                                 if(groupIDA < groupIDB){
462                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
463                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
464                                                         groups[groupIDB].clear(); 
465                                                         groupID = groupIDA;
466                                                 }
467                                                 else{
468                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
469                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
470                                                         groups[groupIDA].clear();  
471                                                         groupID = groupIDB;
472                                                 }
473                                                 break;
474                                         }
475                                 }
476                                 
477         //windows is gonna gag on the reuse of outFile, will need to make it local...
478                                 
479                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
480                                         set<string> newGroup;
481                                         newGroup.insert(seqA);
482                                         newGroup.insert(seqB);
483                                         groups.push_back(newGroup);
484                                                                         
485                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
486                                         outputs.push_back(tempOut);
487                                         numOutputs.push_back(1);
488                                         wroteOutPut.push_back(false);
489                                         
490                                         numGroups++;
491                                 }
492                                 else{
493                                         string fileName = distFile + "." + toString(groupID) + ".temp";
494                                                                                         
495                                         //have we reached the max buffer size
496                                         if (numOutputs[groupID] > 60) { //write out sequence
497                                                 outFile.open(fileName.c_str(), ios::app);
498                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
499                                                 outFile.close();
500                                                 
501                                                 outputs[groupID] = "";
502                                                 numOutputs[groupID] = 0;
503                                                 wroteOutPut[groupID] = true;
504                                         }else {
505                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
506                                                 numOutputs[groupID]++;
507                                         }
508                                         
509                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
510                                                 string row, column, distance;
511                                                 if(groupIDA<groupIDB){
512                                                         
513                                                         //merge memory
514                                                         numOutputs[groupID] += numOutputs[groupIDB];
515                                                         outputs[groupID] += outputs[groupIDB];
516                                                         
517                                                         outputs[groupIDB] = "";
518                                                         numOutputs[groupIDB] = 0;
519                                                         
520                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
521                                                         if (wroteOutPut[groupIDB]) {
522                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
523                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
524                                                                 
525                                                                 outFile.open(fileName.c_str(), ios::app);
526                                                                 
527                                                                 long size;
528                                                                 char* memblock;
529
530                                                                 size = fileB.tellg();
531                                 
532                                                                 fileB.seekg (0, ios::beg);
533                                                                 
534                                                                 int numRead = size / 1024;
535                                                                 int lastRead = size % 1024;
536
537                                                                 for (int i = 0; i < numRead; i++) {
538                                 
539                                                                         memblock = new char [1024];
540                                                                 
541                                                                         fileB.read (memblock, 1024);
542                                                                         
543                                                                         string temp = memblock;
544                                                                         outFile << temp.substr(0, 1024);
545                                                                         
546                                                                         delete memblock;
547                                                                 }
548                                                                 
549                                                                 memblock = new char [lastRead];
550                                                                 
551                                                                 fileB.read (memblock, lastRead);
552                                                                 
553                                                                 //not sure why but it will read more than lastRead char...??
554                                                                 string temp = memblock;
555                                                                 outFile << temp.substr(0, lastRead);
556                                                                 delete memblock;
557                                                                 
558                                                                 fileB.close();
559                                                                 remove(fileName2.c_str());
560                                                                 
561                                                                 //write out the merged memory
562                                                                 if (numOutputs[groupID] > 60) {
563                                                                         outFile << outputs[groupID];
564                                                                         outputs[groupID] = "";
565                                                                         numOutputs[groupID] = 0;
566                                                                 }
567                                                                 
568                                                                 outFile.close();
569                                                                 
570                                                                 wroteOutPut[groupID] = true;
571                                                                 wroteOutPut[groupIDB] = false;
572                                                         }else{ } //just merge b's memory with a's memory 
573                                                 }
574                                                 else{
575                                                         numOutputs[groupID] += numOutputs[groupIDA];
576                                                         outputs[groupID] += outputs[groupIDA];
577                                                         
578                                                         outputs[groupIDA] = "";
579                                                         numOutputs[groupIDA] = 0;
580                                                         
581                                                         if (wroteOutPut[groupIDA]) {
582                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
583                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
584                                                                 
585                                                                 outFile.open(fileName.c_str(), ios::app);
586                                                                 
587                                                                 long size;
588                                                                 char* memblock;
589
590                                                                 size = fileB.tellg();
591                                                                                                                         
592                                                                 fileB.seekg (0, ios::beg);
593                                                                 
594                                                                 int numRead = size / 1024;
595                                                                 int lastRead = size % 1024;
596
597                                                                 for (int i = 0; i < numRead; i++) {
598                                 
599                                                                         memblock = new char [1024];
600                                                                 
601                                                                         fileB.read (memblock, 1024);
602                                                                         string temp = memblock;
603                                                                         outFile << temp.substr(0, 1024);
604                                                                         
605                                                                         delete memblock;
606                                                                 }
607                                                                 
608                                                                 memblock = new char [lastRead];
609                                                                 
610                                                                 fileB.read (memblock, lastRead);
611                                                                 
612                                                                 //not sure why but it will read more than lastRead char...??
613                                                                 string temp = memblock;
614                                                                 outFile << temp.substr(0, lastRead);
615                                                                         
616                                                                 delete memblock;
617                                                                 
618                                                                 fileB.close();
619                                                                 remove(fileName2.c_str());
620                                                                 
621                                                                 //write out the merged memory
622                                                                 if (numOutputs[groupID] > 60) {
623                                                                         outFile << outputs[groupID];
624                                                                         outputs[groupID] = "";
625                                                                         numOutputs[groupID] = 0;
626                                                                 }
627                                                                 
628                                                                 outFile.close();
629                                                                 
630                                                                 wroteOutPut[groupID] = true;
631                                                                 wroteOutPut[groupIDA] = false;
632                                                         }else { } //just merge memory
633                                                 }                                       
634                                         }
635                                 }
636                         }
637                         m->gobble(dFile);
638                 }
639                 dFile.close();
640                 
641                 for (int i = 0; i < numGroups; i++) {
642                         if (numOutputs[i] > 0) {
643                                 string fileName = distFile + "." + toString(i) + ".temp";
644                                 outFile.open(fileName.c_str(), ios::app);
645                                 outFile << outputs[i];
646                                 outFile.close();
647                         }
648                 }
649
650                 splitNames(groups);
651                                 
652                 return 0;                       
653         }
654         catch(exception& e) {
655                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
656                 exit(1);
657         }
658 }
659 //********************************************************************************************************************
660 int SplitMatrix::splitNames(vector<set<string> >& groups){
661         try {
662                 int numGroups = groups.size();
663         
664                 ifstream bigNameFile(namefile.c_str());
665                 if(!bigNameFile){
666                         cerr << "Error: We can't open the name file\n";
667                         exit(1);
668                 }
669                 
670                 map<string, string> nameMap;
671                 string name, nameList;
672                 while(bigNameFile){
673                         bigNameFile >> name >> nameList;
674                         nameMap[name] = nameList;
675                         m->gobble(bigNameFile);
676                 }
677                 bigNameFile.close();
678                         
679                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
680                         int numSeqsInGroup = groups[i].size();
681                         
682                         if(numSeqsInGroup > 0){
683                                 string fileName = namefile + "." + toString(i) + ".temp";
684                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
685                                 
686                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
687                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
688                                         if (nIt != nameMap.end()) {
689                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
690                                                 nameMap.erase(nIt);
691                                         }else{
692                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
693                                         }
694                                 }
695                                 smallNameFile.close();
696                         }
697                 }
698                 
699                 //names of singletons
700                 if (nameMap.size() != 0) {
701                         singleton = namefile + ".extra.temp";
702                         ofstream remainingNames(singleton.c_str(), ios::ate);
703                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
704                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
705                         }
706                         remainingNames.close();
707                 }else { singleton = "none"; }
708                         
709                 for(int i=0;i<numGroups;i++){
710                         if(groups[i].size() > 0){
711                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
712                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
713                                 
714                                 map<string, string> temp;
715                                 temp[tempDistFile] = tempNameFile;
716                                 dists.push_back(temp);
717                         }
718                 }
719                 
720                 if (m->control_pressed)  {  
721                         for (int i = 0; i < dists.size(); i++) { 
722                                 remove((dists[i].begin()->first).c_str());
723                                 remove((dists[i].begin()->second).c_str());
724                         }
725                         dists.clear();
726                 }
727                 
728                 return 0;
729         }
730         catch(exception& e) {
731                 m->errorOut(e, "SplitMatrix", "splitNames");
732                 exit(1);
733         }
734 }
735 //********************************************************************************************************************
736 int SplitMatrix::splitDistanceRAM(){
737         try {
738                 vector<set<string> > groups;
739                 vector<string> outputs;
740                 
741                 int numGroups = 0;
742
743                 ifstream dFile;
744                 m->openInputFile(distFile, dFile);
745
746                 while(dFile){
747                         string seqA, seqB;
748                         float dist;
749
750                         dFile >> seqA >> seqB >> dist;
751                         
752                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
753                                         
754                         if(dist < cutoff){
755                                 //cout << "in cutoff: " << dist << endl;
756                                 int groupIDA = -1;
757                                 int groupIDB = -1;
758                                 int groupID = -1;
759                                 
760                                 for(int i=0;i<numGroups;i++){
761                                         set<string>::iterator aIt = groups[i].find(seqA);
762                                         set<string>::iterator bIt = groups[i].find(seqB);
763                                         
764                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
765                                                 groups[i].insert(seqB);
766                                                 groupIDA = i;
767                                                 groupID = groupIDA;
768
769                                                 //cout << "in aIt: " << groupID << endl;
770         //                                      break;
771                                         }
772                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
773                                                 groups[i].insert(seqA);
774                                                 groupIDB = i;
775                                                 groupID = groupIDB;
776
777                                         //      cout << "in bIt: " << groupID << endl;
778         //                                      break;
779                                         }
780                                 
781                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
782                                                 if(groupIDA < groupIDB){
783                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
784                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
785                                                         groups[groupIDB].clear(); 
786                                                         groupID = groupIDA;
787                                                 }
788                                                 else{
789                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
790                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
791                                                         groups[groupIDA].clear();  
792                                                         groupID = groupIDB;
793                                                 }
794                                                 break;
795                                         }
796                                 }
797                                 
798         //windows is gonna gag on the reuse of outFile, will need to make it local...
799                                 
800                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
801                                         set<string> newGroup;
802                                         newGroup.insert(seqA);
803                                         newGroup.insert(seqB);
804                                         groups.push_back(newGroup);
805                                                                         
806                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
807                                         outputs.push_back(tempOut);
808                                         numGroups++;
809                                 }
810                                 else{
811                                                                                         
812                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
813                                         
814                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
815                                                 string row, column, distance;
816                                                 if(groupIDA<groupIDB){
817                                                         //merge memory
818                                                         outputs[groupID] += outputs[groupIDB];
819                                                         outputs[groupIDB] = "";
820                                                 }else{
821                                                         outputs[groupID] += outputs[groupIDA];
822                                                         outputs[groupIDA] = "";
823                                                 }                                       
824                                         }
825                                 }
826                         }
827                         m->gobble(dFile);
828                 }
829                 dFile.close();
830                 
831                 for (int i = 0; i < numGroups; i++) {
832                         if (outputs[i] != "") {
833                                 ofstream outFile;
834                                 string fileName = distFile + "." + toString(i) + ".temp";
835                                 outFile.open(fileName.c_str(), ios::ate);
836                                 outFile << outputs[i];
837                                 outFile.close();
838                         }
839                 }
840
841                 splitNames(groups);
842                                 
843                 return 0;                       
844         }
845         catch(exception& e) {
846                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
847                 exit(1);
848         }
849 }
850 //********************************************************************************************************************
851 //sorts biggest to smallest
852 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
853         
854         FILE * pFile;
855         long leftsize = 0;
856                 
857         //get num bytes in file
858         string filename = left.begin()->first;
859         pFile = fopen (filename.c_str(),"rb");
860         string error = "Error opening " + filename;
861         if (pFile==NULL) perror (error.c_str());
862         else{
863                 fseek (pFile, 0, SEEK_END);
864                 leftsize=ftell (pFile);
865                 fclose (pFile);
866         }
867
868         FILE * pFile2;
869         long rightsize = 0;
870                 
871         //get num bytes in file
872         filename = right.begin()->first;
873         pFile2 = fopen (filename.c_str(),"rb");
874         error = "Error opening " + filename;
875         if (pFile2==NULL) perror (error.c_str());
876         else{
877                 fseek (pFile2, 0, SEEK_END);
878                 rightsize=ftell (pFile2);
879                 fclose (pFile2);
880         }
881
882         return (leftsize > rightsize);  
883
884 /***********************************************************************/
885 //returns map of distance files -> namefile sorted by distance file size
886 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
887         try {   
888                 
889                 sort(dists.begin(), dists.end(), compareFileSizes);
890                 
891                 return dists;
892         }
893         catch(exception& e) {
894                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
895                 exit(1);
896         }
897 }
898 /***********************************************************************/
899 SplitMatrix::~SplitMatrix(){}
900 /***********************************************************************/
901