]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
1.19.0
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24         large = l;
25 }
26 /***********************************************************************/
27
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){
29         m = MothurOut::getInstance();
30         fastafile = ffile;
31         namefile = name;
32         taxFile = tax;
33         cutoff = c;  //tax level cutoff
34         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
35         method = t;
36         processors = p;
37         outputDir = output;
38 }
39
40 /***********************************************************************/
41
42 int SplitMatrix::split(){
43         try {
44         
45                 if (method == "distance") {  
46                         splitDistance();
47                 }else if ((method == "classify") || (method == "fasta")) {
48                         splitClassify();
49                 }else {
50                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
51                         map<string, string> temp;
52                         temp[distFile] = namefile;
53                         dists.push_back(temp);
54                 }
55                 
56                 return 0;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "SplitMatrix", "split");
60                 exit(1);
61         }
62 }
63 /***********************************************************************/
64 int SplitMatrix::splitDistance(){
65         try {
66         
67                 if (large)      { splitDistanceLarge(); }
68                 else            { splitDistanceRAM();   }
69                 
70                 return 0;
71                         
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "SplitMatrix", "splitDistance");
75                 exit(1);
76         }
77 }
78
79 /***********************************************************************/
80 int SplitMatrix::splitClassify(){
81         try {
82                 cutoff = int(cutoff);
83                                 
84                 map<string, int> seqGroup;
85                 map<string, int>::iterator it;
86                 map<string, int>::iterator it2;
87                 
88                 int numGroups = 0;
89                 
90                 //build tree from users taxonomy file
91                 PhyloTree* phylo = new PhyloTree();
92                 
93                 ifstream in;
94                 m->openInputFile(taxFile, in);
95                         
96                 //read in users taxonomy file and add sequences to tree
97                 string seqname, tax;
98                 while(!in.eof()){
99                         in >> seqname >> tax; m->gobble(in);
100                         phylo->addSeqToTree(seqname, tax);
101                 }
102                 in.close();
103                 
104                 phylo->assignHeirarchyIDs(0);
105
106                 //make sure the cutoff is not greater than maxlevel
107                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
108         
109                 //for each node in tree
110                 for (int i = 0; i < phylo->getNumNodes(); i++) {
111                 
112                         //is this node within the cutoff
113                         TaxNode taxon = phylo->get(i);
114         
115                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
116                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
117                                         for (int j = 0; j < taxon.accessions.size(); j++) {
118                                                 seqGroup[taxon.accessions[j]] = numGroups;
119                                         }
120                                         numGroups++;
121                                 }
122                         }
123                 }
124         
125                 delete phylo;
126                 
127                 if (method == "classify") {
128                         splitDistanceFileByTax(seqGroup, numGroups);
129                 }else {
130                         createDistanceFilesFromTax(seqGroup, numGroups);
131                 }
132                 
133                 return 0;
134                         
135         }
136         catch(exception& e) {
137                 m->errorOut(e, "SplitMatrix", "splitClassify");
138                 exit(1);
139         }
140 }
141 /***********************************************************************/
142 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
143         try {
144                 map<string, int> copyGroups = seqGroup;
145                 map<string, int>::iterator it;
146                 set<string> names;
147                                 
148                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
149                         remove((fastafile + "." + toString(i) + ".temp").c_str());
150                 }
151                         
152                 ifstream in;
153                 m->openInputFile(fastafile, in);
154         
155                 //parse fastafile
156                 ofstream outFile;
157                 while (!in.eof()) {
158                         Sequence query(in); m->gobble(in);
159                         if (query.getName() != "") {
160                 
161                                 it = seqGroup.find(query.getName());
162                                 
163                                 //save names in case no namefile is given
164                                 if (namefile == "") {  names.insert(query.getName()); }
165                         
166                                 if (it != seqGroup.end()) { //not singleton 
167                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
168                                         query.printSequence(outFile); 
169                                         outFile.close();
170                                         
171                                         copyGroups.erase(query.getName());
172                                 }
173                         }
174                 }
175                 in.close();
176                 
177                 //warn about sequence in groups that are not in fasta file
178                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
179                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
180                         exit(1);
181                 }
182                 
183                 copyGroups.clear();
184                 
185                 //process each distance file
186                 for (int i = 0; i < numGroups; i++) { 
187                         
188                         string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff);
189                         if (outputDir != "") { options += ", outputdir=" + outputDir; }
190                         
191                         Command* command = new DistanceCommand(options);
192                         
193                         command->execute();
194                         delete command;
195                         
196                         remove((fastafile + "." + toString(i) + ".temp").c_str());
197                         
198                         //remove old names files just in case
199                         remove((namefile + "." + toString(i) + ".temp").c_str());
200                 }
201                         
202                 singleton = namefile + ".extra.temp";
203                 ofstream remainingNames;
204                 m->openOutputFile(singleton, remainingNames);
205                 
206                 bool wroteExtra = false;
207
208                 ifstream bigNameFile;
209                 m->openInputFile(namefile, bigNameFile);
210                 
211                 string name, nameList;
212                 while(!bigNameFile.eof()){
213                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
214                         
215                         //did this sequence get assigned a group
216                         it = seqGroup.find(name);
217                         
218                         if (it != seqGroup.end()) {  
219                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
220                                 outFile << name << '\t' << nameList << endl;
221                                 outFile.close();
222                         }else{
223                                 wroteExtra = true;
224                                 remainingNames << name << '\t' << nameList << endl;
225                         }
226                 }
227                 bigNameFile.close();
228                 
229                 for(int i=0;i<numGroups;i++){
230                         string tempNameFile = namefile + "." + toString(i) + ".temp";
231                         if (outputDir == "") { outputDir = m->hasPath(fastafile); }
232                         string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
233
234                         //if there are valid distances
235                         ifstream fileHandle;
236                         fileHandle.open(tempDistFile.c_str());
237                         if(fileHandle)  {       
238                                 m->gobble(fileHandle);
239                                 if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
240                                         map<string, string> temp;
241                                         temp[tempDistFile] = tempNameFile;
242                                         dists.push_back(temp);
243                                 }else {
244                                         ifstream in;
245                                         m->openInputFile(tempNameFile, in);
246                                 
247                                         while(!in.eof()) { 
248                                                 in >> name >> nameList;  m->gobble(in);
249                                                 wroteExtra = true;
250                                                 remainingNames << name << '\t' << nameList << endl;
251                                         }
252                                         in.close();
253                                         remove(tempNameFile.c_str());
254                                 }
255                         }
256                         fileHandle.close();
257                 }
258                 
259                 remainingNames.close();
260                 if (!wroteExtra) { 
261                         remove(singleton.c_str());
262                         singleton = "none";
263                 }
264
265                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
266                 
267                 return 0;
268         }
269         catch(exception& e) {
270                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
271                 exit(1);
272         }
273 }
274 /***********************************************************************/
275 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
276         try {
277                 map<string, int>::iterator it;
278                 map<string, int>::iterator it2;
279                 
280                 ifstream dFile;
281                 m->openInputFile(distFile, dFile);
282                 ofstream outFile;
283                 
284                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
285                         remove((distFile + "." + toString(i) + ".temp").c_str());
286                 }
287                 
288                 //for buffering the io to improve speed
289                  //allow for 10 dists to be stored, then output.
290                 vector<string> outputs;  outputs.resize(numGroups, "");
291                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
292                 
293                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
294                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
295                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
296                 
297                 //for each distance
298                 while(dFile){
299                         string seqA, seqB;
300                         float dist;
301                         
302                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
303                         
304                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
305                         
306                         //if both sequences are in the same group then they are within the cutoff
307                         it = seqGroup.find(seqA);
308                         it2 = seqGroup.find(seqB);
309                         
310                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
311                                 if (it->second == it2->second) { //they are from the same group so add the distance
312                                         if (numOutputs[it->second] > 30) {
313                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
314                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
315                                                 outFile.close();
316                                                 outputs[it->second] = "";
317                                                 numOutputs[it->second] = 0;
318                                                 validDistances[it->second] = true;
319                                         }else{
320                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
321                                                 numOutputs[it->second]++;
322                                         }
323                                 }
324                         }
325                 }
326                 dFile.close();
327         
328                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
329                         remove((namefile + "." + toString(i) + ".temp").c_str());
330                         
331                         //write out any remaining buffers
332                         if (numOutputs[i] > 0) {
333                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
334                                 outFile << outputs[i];
335                                 outFile.close();
336                                 outputs[i] = "";
337                                 numOutputs[i] = 0;
338                                 validDistances[i] = true;
339                         }
340                 }
341                 
342                 ifstream bigNameFile;
343                 m->openInputFile(namefile, bigNameFile);
344                 
345                 singleton = namefile + ".extra.temp";
346                 ofstream remainingNames;
347                 m->openOutputFile(singleton, remainingNames);
348                 
349                 bool wroteExtra = false;
350                                                 
351                 string name, nameList;
352                 while(!bigNameFile.eof()){
353                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
354                         
355                         //did this sequence get assigned a group
356                         it = seqGroup.find(name);
357                         
358                         if (it != seqGroup.end()) {  
359                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
360                                 outFile << name << '\t' << nameList << endl;
361                                 outFile.close();
362                         }else{
363                                 wroteExtra = true;
364                                 remainingNames << name << '\t' << nameList << endl;
365                         }
366                 }
367                 bigNameFile.close();
368                                 
369                 for(int i=0;i<numGroups;i++){
370                         string tempNameFile = namefile + "." + toString(i) + ".temp";
371                         string tempDistFile = distFile + "." + toString(i) + ".temp";
372
373                         //if there are valid distances
374                         if (validDistances[i]) {
375                                 map<string, string> temp;
376                                 temp[tempDistFile] = tempNameFile;
377                                 dists.push_back(temp);
378                         }else{
379                                 ifstream in;
380                                 m->openInputFile(tempNameFile, in);
381                                 
382                                 while(!in.eof()) { 
383                                         in >> name >> nameList;  m->gobble(in);
384                                         wroteExtra = true;
385                                         remainingNames << name << '\t' << nameList << endl;
386                                 }
387                                 in.close();
388                                 remove(tempNameFile.c_str());
389                         }
390                 }
391                 
392                 remainingNames.close();
393                 
394                 if (!wroteExtra) { 
395                         remove(singleton.c_str());
396                         singleton = "none";
397                 }
398
399                 if (m->control_pressed)  {  
400                         for (int i = 0; i < dists.size(); i++) { 
401                                 remove((dists[i].begin()->first).c_str());
402                                 remove((dists[i].begin()->second).c_str());
403                         }
404                         dists.clear();
405                 }
406                 
407                 return 0;
408         }
409         catch(exception& e) {
410                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
411                 exit(1);
412         }
413 }
414 /***********************************************************************/
415 int SplitMatrix::splitDistanceLarge(){
416         try {
417                 vector<set<string> > groups;
418                 
419                 //for buffering the io to improve speed
420                  //allow for 30 dists to be stored, then output.
421                 vector<string> outputs;
422                 vector<int> numOutputs;
423                 vector<bool> wroteOutPut;
424                 
425                 int numGroups = 0;
426
427                 ofstream outFile;
428                 ifstream dFile;
429                 m->openInputFile(distFile, dFile);
430         
431                 while(dFile){
432                         string seqA, seqB;
433                         float dist;
434
435                         dFile >> seqA >> seqB >> dist;
436                         
437                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
438                                         
439                         if(dist < cutoff){
440                                 //cout << "in cutoff: " << dist << endl;
441                                 int groupIDA = -1;
442                                 int groupIDB = -1;
443                                 int groupID = -1;
444                                 
445                                 for(int i=0;i<numGroups;i++){
446                                         set<string>::iterator aIt = groups[i].find(seqA);
447                                         set<string>::iterator bIt = groups[i].find(seqB);
448                                         
449                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
450                                                 groups[i].insert(seqB);
451                                                 groupIDA = i;
452                                                 groupID = groupIDA;
453
454                                                 //cout << "in aIt: " << groupID << endl;
455         //                                      break;
456                                         }
457                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
458                                                 groups[i].insert(seqA);
459                                                 groupIDB = i;
460                                                 groupID = groupIDB;
461
462                                         //      cout << "in bIt: " << groupID << endl;
463         //                                      break;
464                                         }
465                                 
466                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
467                                                 if(groupIDA < groupIDB){
468                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
469                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
470                                                         groups[groupIDB].clear(); 
471                                                         groupID = groupIDA;
472                                                 }
473                                                 else{
474                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
475                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
476                                                         groups[groupIDA].clear();  
477                                                         groupID = groupIDB;
478                                                 }
479                                                 break;
480                                         }
481                                 }
482                                 
483         //windows is gonna gag on the reuse of outFile, will need to make it local...
484                                 
485                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
486                                         set<string> newGroup;
487                                         newGroup.insert(seqA);
488                                         newGroup.insert(seqB);
489                                         groups.push_back(newGroup);
490                                                                         
491                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
492                                         outputs.push_back(tempOut);
493                                         numOutputs.push_back(1);
494                                         wroteOutPut.push_back(false);
495                                         
496                                         numGroups++;
497                                 }
498                                 else{
499                                         string fileName = distFile + "." + toString(groupID) + ".temp";
500                                                                                         
501                                         //have we reached the max buffer size
502                                         if (numOutputs[groupID] > 60) { //write out sequence
503                                                 outFile.open(fileName.c_str(), ios::app);
504                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
505                                                 outFile.close();
506                                                 
507                                                 outputs[groupID] = "";
508                                                 numOutputs[groupID] = 0;
509                                                 wroteOutPut[groupID] = true;
510                                         }else {
511                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
512                                                 numOutputs[groupID]++;
513                                         }
514                                         
515                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
516                                                 string row, column, distance;
517                                                 if(groupIDA<groupIDB){
518                                                         
519                                                         //merge memory
520                                                         numOutputs[groupID] += numOutputs[groupIDB];
521                                                         outputs[groupID] += outputs[groupIDB];
522                                                         
523                                                         outputs[groupIDB] = "";
524                                                         numOutputs[groupIDB] = 0;
525                                                         
526                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
527                                                         if (wroteOutPut[groupIDB]) {
528                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
529                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
530                                                                 
531                                                                 outFile.open(fileName.c_str(), ios::app);
532                                                                 
533                                                                 long size;
534                                                                 char* memblock;
535
536                                                                 size = fileB.tellg();
537                                 
538                                                                 fileB.seekg (0, ios::beg);
539                                                                 
540                                                                 int numRead = size / 1024;
541                                                                 int lastRead = size % 1024;
542
543                                                                 for (int i = 0; i < numRead; i++) {
544                                 
545                                                                         memblock = new char [1024];
546                                                                 
547                                                                         fileB.read (memblock, 1024);
548                                                                         
549                                                                         string temp = memblock;
550                                                                         outFile << temp.substr(0, 1024);
551                                                                         
552                                                                         delete memblock;
553                                                                 }
554                                                                 
555                                                                 memblock = new char [lastRead];
556                                                                 
557                                                                 fileB.read (memblock, lastRead);
558                                                                 
559                                                                 //not sure why but it will read more than lastRead char...??
560                                                                 string temp = memblock;
561                                                                 outFile << temp.substr(0, lastRead);
562                                                                 delete memblock;
563                                                                 
564                                                                 fileB.close();
565                                                                 remove(fileName2.c_str());
566                                                                 
567                                                                 //write out the merged memory
568                                                                 if (numOutputs[groupID] > 60) {
569                                                                         outFile << outputs[groupID];
570                                                                         outputs[groupID] = "";
571                                                                         numOutputs[groupID] = 0;
572                                                                 }
573                                                                 
574                                                                 outFile.close();
575                                                                 
576                                                                 wroteOutPut[groupID] = true;
577                                                                 wroteOutPut[groupIDB] = false;
578                                                         }else{ } //just merge b's memory with a's memory 
579                                                 }
580                                                 else{
581                                                         numOutputs[groupID] += numOutputs[groupIDA];
582                                                         outputs[groupID] += outputs[groupIDA];
583                                                         
584                                                         outputs[groupIDA] = "";
585                                                         numOutputs[groupIDA] = 0;
586                                                         
587                                                         if (wroteOutPut[groupIDA]) {
588                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
589                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
590                                                                 
591                                                                 outFile.open(fileName.c_str(), ios::app);
592                                                                 
593                                                                 long size;
594                                                                 char* memblock;
595
596                                                                 size = fileB.tellg();
597                                                                                                                         
598                                                                 fileB.seekg (0, ios::beg);
599                                                                 
600                                                                 int numRead = size / 1024;
601                                                                 int lastRead = size % 1024;
602
603                                                                 for (int i = 0; i < numRead; i++) {
604                                 
605                                                                         memblock = new char [1024];
606                                                                 
607                                                                         fileB.read (memblock, 1024);
608                                                                         string temp = memblock;
609                                                                         outFile << temp.substr(0, 1024);
610                                                                         
611                                                                         delete memblock;
612                                                                 }
613                                                                 
614                                                                 memblock = new char [lastRead];
615                                                                 
616                                                                 fileB.read (memblock, lastRead);
617                                                                 
618                                                                 //not sure why but it will read more than lastRead char...??
619                                                                 string temp = memblock;
620                                                                 outFile << temp.substr(0, lastRead);
621                                                                         
622                                                                 delete memblock;
623                                                                 
624                                                                 fileB.close();
625                                                                 remove(fileName2.c_str());
626                                                                 
627                                                                 //write out the merged memory
628                                                                 if (numOutputs[groupID] > 60) {
629                                                                         outFile << outputs[groupID];
630                                                                         outputs[groupID] = "";
631                                                                         numOutputs[groupID] = 0;
632                                                                 }
633                                                                 
634                                                                 outFile.close();
635                                                                 
636                                                                 wroteOutPut[groupID] = true;
637                                                                 wroteOutPut[groupIDA] = false;
638                                                         }else { } //just merge memory
639                                                 }                                       
640                                         }
641                                 }
642                         }
643                         m->gobble(dFile);
644                 }
645                 dFile.close();
646                 
647                 for (int i = 0; i < numGroups; i++) {
648                         if (numOutputs[i] > 0) {
649                                 string fileName = distFile + "." + toString(i) + ".temp";
650                                 outFile.open(fileName.c_str(), ios::app);
651                                 outFile << outputs[i];
652                                 outFile.close();
653                         }
654                 }
655
656                 splitNames(groups);
657                                 
658                 return 0;                       
659         }
660         catch(exception& e) {
661                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
662                 exit(1);
663         }
664 }
665 //********************************************************************************************************************
666 int SplitMatrix::splitNames(vector<set<string> >& groups){
667         try {
668                 int numGroups = groups.size();
669         
670                 ifstream bigNameFile(namefile.c_str());
671                 if(!bigNameFile){
672                         cerr << "Error: We can't open the name file\n";
673                         exit(1);
674                 }
675                 
676                 map<string, string> nameMap;
677                 string name, nameList;
678                 while(bigNameFile){
679                         bigNameFile >> name >> nameList;
680                         nameMap[name] = nameList;
681                         m->gobble(bigNameFile);
682                 }
683                 bigNameFile.close();
684                         
685                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
686                         int numSeqsInGroup = groups[i].size();
687                         
688                         if(numSeqsInGroup > 0){
689                                 string fileName = namefile + "." + toString(i) + ".temp";
690                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
691                                 
692                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
693                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
694                                         if (nIt != nameMap.end()) {
695                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
696                                                 nameMap.erase(nIt);
697                                         }else{
698                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
699                                         }
700                                 }
701                                 smallNameFile.close();
702                         }
703                 }
704                 
705                 //names of singletons
706                 if (nameMap.size() != 0) {
707                         singleton = namefile + ".extra.temp";
708                         ofstream remainingNames(singleton.c_str(), ios::ate);
709                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
710                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
711                         }
712                         remainingNames.close();
713                 }else { singleton = "none"; }
714                         
715                 for(int i=0;i<numGroups;i++){
716                         if(groups[i].size() > 0){
717                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
718                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
719                                 
720                                 map<string, string> temp;
721                                 temp[tempDistFile] = tempNameFile;
722                                 dists.push_back(temp);
723                         }
724                 }
725                 
726                 if (m->control_pressed)  {  
727                         for (int i = 0; i < dists.size(); i++) { 
728                                 remove((dists[i].begin()->first).c_str());
729                                 remove((dists[i].begin()->second).c_str());
730                         }
731                         dists.clear();
732                 }
733                 
734                 return 0;
735         }
736         catch(exception& e) {
737                 m->errorOut(e, "SplitMatrix", "splitNames");
738                 exit(1);
739         }
740 }
741 //********************************************************************************************************************
742 int SplitMatrix::splitDistanceRAM(){
743         try {
744                 vector<set<string> > groups;
745                 vector<string> outputs;
746                 
747                 int numGroups = 0;
748
749                 ifstream dFile;
750                 m->openInputFile(distFile, dFile);
751
752                 while(dFile){
753                         string seqA, seqB;
754                         float dist;
755
756                         dFile >> seqA >> seqB >> dist;
757                         
758                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
759                                         
760                         if(dist < cutoff){
761                                 //cout << "in cutoff: " << dist << endl;
762                                 int groupIDA = -1;
763                                 int groupIDB = -1;
764                                 int groupID = -1;
765                                 
766                                 for(int i=0;i<numGroups;i++){
767                                         set<string>::iterator aIt = groups[i].find(seqA);
768                                         set<string>::iterator bIt = groups[i].find(seqB);
769                                         
770                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
771                                                 groups[i].insert(seqB);
772                                                 groupIDA = i;
773                                                 groupID = groupIDA;
774
775                                                 //cout << "in aIt: " << groupID << endl;
776         //                                      break;
777                                         }
778                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
779                                                 groups[i].insert(seqA);
780                                                 groupIDB = i;
781                                                 groupID = groupIDB;
782
783                                         //      cout << "in bIt: " << groupID << endl;
784         //                                      break;
785                                         }
786                                 
787                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
788                                                 if(groupIDA < groupIDB){
789                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
790                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
791                                                         groups[groupIDB].clear(); 
792                                                         groupID = groupIDA;
793                                                 }
794                                                 else{
795                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
796                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
797                                                         groups[groupIDA].clear();  
798                                                         groupID = groupIDB;
799                                                 }
800                                                 break;
801                                         }
802                                 }
803                                 
804         //windows is gonna gag on the reuse of outFile, will need to make it local...
805                                 
806                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
807                                         set<string> newGroup;
808                                         newGroup.insert(seqA);
809                                         newGroup.insert(seqB);
810                                         groups.push_back(newGroup);
811                                                                         
812                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
813                                         outputs.push_back(tempOut);
814                                         numGroups++;
815                                 }
816                                 else{
817                                                                                         
818                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
819                                         
820                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
821                                                 string row, column, distance;
822                                                 if(groupIDA<groupIDB){
823                                                         //merge memory
824                                                         outputs[groupID] += outputs[groupIDB];
825                                                         outputs[groupIDB] = "";
826                                                 }else{
827                                                         outputs[groupID] += outputs[groupIDA];
828                                                         outputs[groupIDA] = "";
829                                                 }                                       
830                                         }
831                                 }
832                         }
833                         m->gobble(dFile);
834                 }
835                 dFile.close();
836                 
837                 for (int i = 0; i < numGroups; i++) {
838                         if (outputs[i] != "") {
839                                 ofstream outFile;
840                                 string fileName = distFile + "." + toString(i) + ".temp";
841                                 outFile.open(fileName.c_str(), ios::ate);
842                                 outFile << outputs[i];
843                                 outFile.close();
844                         }
845                 }
846
847                 splitNames(groups);
848                                 
849                 return 0;                       
850         }
851         catch(exception& e) {
852                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
853                 exit(1);
854         }
855 }
856 //********************************************************************************************************************
857 //sorts biggest to smallest
858 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
859         
860         FILE * pFile;
861         long leftsize = 0;
862                 
863         //get num bytes in file
864         string filename = left.begin()->first;
865         pFile = fopen (filename.c_str(),"rb");
866         string error = "Error opening " + filename;
867         if (pFile==NULL) perror (error.c_str());
868         else{
869                 fseek (pFile, 0, SEEK_END);
870                 leftsize=ftell (pFile);
871                 fclose (pFile);
872         }
873
874         FILE * pFile2;
875         long rightsize = 0;
876                 
877         //get num bytes in file
878         filename = right.begin()->first;
879         pFile2 = fopen (filename.c_str(),"rb");
880         error = "Error opening " + filename;
881         if (pFile2==NULL) perror (error.c_str());
882         else{
883                 fseek (pFile2, 0, SEEK_END);
884                 rightsize=ftell (pFile2);
885                 fclose (pFile2);
886         }
887
888         return (leftsize > rightsize);  
889
890 /***********************************************************************/
891 //returns map of distance files -> namefile sorted by distance file size
892 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
893         try {   
894                 
895                 sort(dists.begin(), dists.end(), compareFileSizes);
896                 
897                 return dists;
898         }
899         catch(exception& e) {
900                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
901                 exit(1);
902         }
903 }
904 /***********************************************************************/
905 SplitMatrix::~SplitMatrix(){}
906 /***********************************************************************/
907