]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
removed various build warnings
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24         large = l;
25 }
26 /***********************************************************************/
27
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){
29         m = MothurOut::getInstance();
30         fastafile = ffile;
31         namefile = name;
32         taxFile = tax;
33         cutoff = c;  //tax level cutoff
34         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
35         method = t;
36         processors = p;
37         outputDir = output;
38 }
39
40 /***********************************************************************/
41
42 int SplitMatrix::split(){
43         try {
44         
45                 if (method == "distance") {  
46                         splitDistance();
47                 }else if ((method == "classify") || (method == "fasta")) {
48                         splitClassify();
49                 }else {
50                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
51                         map<string, string> temp;
52                         temp[distFile] = namefile;
53                         dists.push_back(temp);
54                 }
55                 
56                 return 0;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "SplitMatrix", "split");
60                 exit(1);
61         }
62 }
63 /***********************************************************************/
64 int SplitMatrix::splitDistance(){
65         try {
66         
67                 if (large)      { splitDistanceLarge(); }
68                 else            { splitDistanceRAM();   }
69                 
70                 return 0;
71                         
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "SplitMatrix", "splitDistance");
75                 exit(1);
76         }
77 }
78
79 /***********************************************************************/
80 int SplitMatrix::splitClassify(){
81         try {
82                 cutoff = int(cutoff);
83                                 
84                 map<string, int> seqGroup;
85                 map<string, int>::iterator it;
86                 map<string, int>::iterator it2;
87                 
88                 int numGroups = 0;
89                 
90                 //build tree from users taxonomy file
91                 PhyloTree* phylo = new PhyloTree();
92                 
93                 ifstream in;
94                 m->openInputFile(taxFile, in);
95                         
96                 //read in users taxonomy file and add sequences to tree
97                 string seqname, tax;
98                 while(!in.eof()){
99                         in >> seqname >> tax; m->gobble(in);
100                         phylo->addSeqToTree(seqname, tax);
101                 }
102                 in.close();
103                 
104                 phylo->assignHeirarchyIDs(0);
105
106                 //make sure the cutoff is not greater than maxlevel
107                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
108         
109                 //for each node in tree
110                 for (int i = 0; i < phylo->getNumNodes(); i++) {
111                 
112                         //is this node within the cutoff
113                         TaxNode taxon = phylo->get(i);
114         
115                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
116                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
117                                         for (int j = 0; j < taxon.accessions.size(); j++) {
118                                                 seqGroup[taxon.accessions[j]] = numGroups;
119                                         }
120                                         numGroups++;
121                                 }
122                         }
123                 }
124         
125                 delete phylo;
126                 
127                 if (method == "classify") {
128                         splitDistanceFileByTax(seqGroup, numGroups);
129                 }else {
130                         createDistanceFilesFromTax(seqGroup, numGroups);
131                 }
132                 
133                 return 0;
134                         
135         }
136         catch(exception& e) {
137                 m->errorOut(e, "SplitMatrix", "splitClassify");
138                 exit(1);
139         }
140 }
141 /***********************************************************************/
142 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
143         try {
144                 map<string, int> copyGroups = seqGroup;
145                 map<string, int>::iterator it;
146                 set<string> names;
147                                 
148                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
149                         remove((fastafile + "." + toString(i) + ".temp").c_str());
150                 }
151                         
152                 ifstream in;
153                 m->openInputFile(fastafile, in);
154         
155                 //parse fastafile
156                 ofstream outFile;
157                 while (!in.eof()) {
158                         Sequence query(in); m->gobble(in);
159                         if (query.getName() != "") {
160                 
161                                 it = seqGroup.find(query.getName());
162                                 
163                                 //save names in case no namefile is given
164                                 if (namefile == "") {  names.insert(query.getName()); }
165                         
166                                 if (it != seqGroup.end()) { //not singleton 
167                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
168                                         query.printSequence(outFile); 
169                                         outFile.close();
170                                         
171                                         copyGroups.erase(query.getName());
172                                 }
173                         }
174                 }
175                 in.close();
176                 
177                 //warn about sequence in groups that are not in fasta file
178                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
179                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
180                         exit(1);
181                 }
182                 
183                 copyGroups.clear();
184                 
185                 //process each distance file
186                 for (int i = 0; i < numGroups; i++) { 
187                         
188                         string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff);
189                         
190                         Command* command = new DistanceCommand(options);
191                         
192                         command->execute();
193                         delete command;
194                         
195                         remove((fastafile + "." + toString(i) + ".temp").c_str());
196                         
197                         //remove old names files just in case
198                         remove((namefile + "." + toString(i) + ".temp").c_str());
199                 }
200                         
201                 singleton = namefile + ".extra.temp";
202                 ofstream remainingNames;
203                 m->openOutputFile(singleton, remainingNames);
204                 
205                 bool wroteExtra = false;
206
207                 ifstream bigNameFile;
208                 m->openInputFile(namefile, bigNameFile);
209                 
210                 string name, nameList;
211                 while(!bigNameFile.eof()){
212                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
213                         
214                         //did this sequence get assigned a group
215                         it = seqGroup.find(name);
216                         
217                         if (it != seqGroup.end()) {  
218                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
219                                 outFile << name << '\t' << nameList << endl;
220                                 outFile.close();
221                         }else{
222                                 wroteExtra = true;
223                                 remainingNames << name << '\t' << nameList << endl;
224                         }
225                 }
226                 bigNameFile.close();
227                 
228                 for(int i=0;i<numGroups;i++){
229                         string tempNameFile = namefile + "." + toString(i) + ".temp";
230                         if (outputDir == "") { outputDir = m->hasPath(fastafile); }
231                         string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
232
233                         //if there are valid distances
234                         ifstream fileHandle;
235                         fileHandle.open(tempDistFile.c_str());
236                         if(fileHandle)  {       
237                                 m->gobble(fileHandle);
238                                 if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
239                                         map<string, string> temp;
240                                         temp[tempDistFile] = tempNameFile;
241                                         dists.push_back(temp);
242                                 }else {
243                                         ifstream in;
244                                         m->openInputFile(tempNameFile, in);
245                                 
246                                         while(!in.eof()) { 
247                                                 in >> name >> nameList;  m->gobble(in);
248                                                 wroteExtra = true;
249                                                 remainingNames << name << '\t' << nameList << endl;
250                                         }
251                                         in.close();
252                                         remove(tempNameFile.c_str());
253                                 }
254                         }
255                         fileHandle.close();
256                 }
257                 
258                 remainingNames.close();
259                 if (!wroteExtra) { 
260                         remove(singleton.c_str());
261                         singleton = "none";
262                 }
263
264                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
265                 
266                 return 0;
267         }
268         catch(exception& e) {
269                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
270                 exit(1);
271         }
272 }
273 /***********************************************************************/
274 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
275         try {
276                 map<string, int>::iterator it;
277                 map<string, int>::iterator it2;
278                 
279                 ifstream dFile;
280                 m->openInputFile(distFile, dFile);
281                 ofstream outFile;
282                 
283                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
284                         remove((distFile + "." + toString(i) + ".temp").c_str());
285                 }
286                 
287                 //for buffering the io to improve speed
288                  //allow for 10 dists to be stored, then output.
289                 vector<string> outputs;  outputs.resize(numGroups, "");
290                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
291                 
292                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
293                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
294                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
295                 
296                 //for each distance
297                 while(dFile){
298                         string seqA, seqB;
299                         float dist;
300                         
301                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
302                         
303                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
304                         
305                         //if both sequences are in the same group then they are within the cutoff
306                         it = seqGroup.find(seqA);
307                         it2 = seqGroup.find(seqB);
308                         
309                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
310                                 if (it->second == it2->second) { //they are from the same group so add the distance
311                                         if (numOutputs[it->second] > 30) {
312                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
313                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
314                                                 outFile.close();
315                                                 outputs[it->second] = "";
316                                                 numOutputs[it->second] = 0;
317                                                 validDistances[it->second] = true;
318                                         }else{
319                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
320                                                 numOutputs[it->second]++;
321                                         }
322                                 }
323                         }
324                 }
325                 dFile.close();
326         
327                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
328                         remove((namefile + "." + toString(i) + ".temp").c_str());
329                         
330                         //write out any remaining buffers
331                         if (numOutputs[i] > 0) {
332                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
333                                 outFile << outputs[i];
334                                 outFile.close();
335                                 outputs[i] = "";
336                                 numOutputs[i] = 0;
337                                 validDistances[i] = true;
338                         }
339                 }
340                 
341                 ifstream bigNameFile;
342                 m->openInputFile(namefile, bigNameFile);
343                 
344                 singleton = namefile + ".extra.temp";
345                 ofstream remainingNames;
346                 m->openOutputFile(singleton, remainingNames);
347                 
348                 bool wroteExtra = false;
349                                                 
350                 string name, nameList;
351                 while(!bigNameFile.eof()){
352                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
353                         
354                         //did this sequence get assigned a group
355                         it = seqGroup.find(name);
356                         
357                         if (it != seqGroup.end()) {  
358                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
359                                 outFile << name << '\t' << nameList << endl;
360                                 outFile.close();
361                         }else{
362                                 wroteExtra = true;
363                                 remainingNames << name << '\t' << nameList << endl;
364                         }
365                 }
366                 bigNameFile.close();
367                                 
368                 for(int i=0;i<numGroups;i++){
369                         string tempNameFile = namefile + "." + toString(i) + ".temp";
370                         string tempDistFile = distFile + "." + toString(i) + ".temp";
371
372                         //if there are valid distances
373                         if (validDistances[i]) {
374                                 map<string, string> temp;
375                                 temp[tempDistFile] = tempNameFile;
376                                 dists.push_back(temp);
377                         }else{
378                                 ifstream in;
379                                 m->openInputFile(tempNameFile, in);
380                                 
381                                 while(!in.eof()) { 
382                                         in >> name >> nameList;  m->gobble(in);
383                                         wroteExtra = true;
384                                         remainingNames << name << '\t' << nameList << endl;
385                                 }
386                                 in.close();
387                                 remove(tempNameFile.c_str());
388                         }
389                 }
390                 
391                 remainingNames.close();
392                 
393                 if (!wroteExtra) { 
394                         remove(singleton.c_str());
395                         singleton = "none";
396                 }
397
398                 if (m->control_pressed)  {  
399                         for (int i = 0; i < dists.size(); i++) { 
400                                 remove((dists[i].begin()->first).c_str());
401                                 remove((dists[i].begin()->second).c_str());
402                         }
403                         dists.clear();
404                 }
405                 
406                 return 0;
407         }
408         catch(exception& e) {
409                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
410                 exit(1);
411         }
412 }
413 /***********************************************************************/
414 int SplitMatrix::splitDistanceLarge(){
415         try {
416                 vector<set<string> > groups;
417                 
418                 //for buffering the io to improve speed
419                  //allow for 30 dists to be stored, then output.
420                 vector<string> outputs;
421                 vector<int> numOutputs;
422                 vector<bool> wroteOutPut;
423                 
424                 int numGroups = 0;
425
426                 ofstream outFile;
427                 ifstream dFile;
428                 m->openInputFile(distFile, dFile);
429         
430                 while(dFile){
431                         string seqA, seqB;
432                         float dist;
433
434                         dFile >> seqA >> seqB >> dist;
435                         
436                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
437                                         
438                         if(dist < cutoff){
439                                 //cout << "in cutoff: " << dist << endl;
440                                 int groupIDA = -1;
441                                 int groupIDB = -1;
442                                 int groupID = -1;
443                                 
444                                 for(int i=0;i<numGroups;i++){
445                                         set<string>::iterator aIt = groups[i].find(seqA);
446                                         set<string>::iterator bIt = groups[i].find(seqB);
447                                         
448                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
449                                                 groups[i].insert(seqB);
450                                                 groupIDA = i;
451                                                 groupID = groupIDA;
452
453                                                 //cout << "in aIt: " << groupID << endl;
454         //                                      break;
455                                         }
456                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
457                                                 groups[i].insert(seqA);
458                                                 groupIDB = i;
459                                                 groupID = groupIDB;
460
461                                         //      cout << "in bIt: " << groupID << endl;
462         //                                      break;
463                                         }
464                                 
465                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
466                                                 if(groupIDA < groupIDB){
467                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
468                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
469                                                         groups[groupIDB].clear(); 
470                                                         groupID = groupIDA;
471                                                 }
472                                                 else{
473                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
474                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
475                                                         groups[groupIDA].clear();  
476                                                         groupID = groupIDB;
477                                                 }
478                                                 break;
479                                         }
480                                 }
481                                 
482         //windows is gonna gag on the reuse of outFile, will need to make it local...
483                                 
484                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
485                                         set<string> newGroup;
486                                         newGroup.insert(seqA);
487                                         newGroup.insert(seqB);
488                                         groups.push_back(newGroup);
489                                                                         
490                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
491                                         outputs.push_back(tempOut);
492                                         numOutputs.push_back(1);
493                                         wroteOutPut.push_back(false);
494                                         
495                                         numGroups++;
496                                 }
497                                 else{
498                                         string fileName = distFile + "." + toString(groupID) + ".temp";
499                                                                                         
500                                         //have we reached the max buffer size
501                                         if (numOutputs[groupID] > 60) { //write out sequence
502                                                 outFile.open(fileName.c_str(), ios::app);
503                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
504                                                 outFile.close();
505                                                 
506                                                 outputs[groupID] = "";
507                                                 numOutputs[groupID] = 0;
508                                                 wroteOutPut[groupID] = true;
509                                         }else {
510                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
511                                                 numOutputs[groupID]++;
512                                         }
513                                         
514                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
515                                                 string row, column, distance;
516                                                 if(groupIDA<groupIDB){
517                                                         
518                                                         //merge memory
519                                                         numOutputs[groupID] += numOutputs[groupIDB];
520                                                         outputs[groupID] += outputs[groupIDB];
521                                                         
522                                                         outputs[groupIDB] = "";
523                                                         numOutputs[groupIDB] = 0;
524                                                         
525                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
526                                                         if (wroteOutPut[groupIDB]) {
527                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
528                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
529                                                                 
530                                                                 outFile.open(fileName.c_str(), ios::app);
531                                                                 
532                                                                 long size;
533                                                                 char* memblock;
534
535                                                                 size = fileB.tellg();
536                                 
537                                                                 fileB.seekg (0, ios::beg);
538                                                                 
539                                                                 int numRead = size / 1024;
540                                                                 int lastRead = size % 1024;
541
542                                                                 for (int i = 0; i < numRead; i++) {
543                                 
544                                                                         memblock = new char [1024];
545                                                                 
546                                                                         fileB.read (memblock, 1024);
547                                                                         
548                                                                         string temp = memblock;
549                                                                         outFile << temp.substr(0, 1024);
550                                                                         
551                                                                         delete memblock;
552                                                                 }
553                                                                 
554                                                                 memblock = new char [lastRead];
555                                                                 
556                                                                 fileB.read (memblock, lastRead);
557                                                                 
558                                                                 //not sure why but it will read more than lastRead char...??
559                                                                 string temp = memblock;
560                                                                 outFile << temp.substr(0, lastRead);
561                                                                 delete memblock;
562                                                                 
563                                                                 fileB.close();
564                                                                 remove(fileName2.c_str());
565                                                                 
566                                                                 //write out the merged memory
567                                                                 if (numOutputs[groupID] > 60) {
568                                                                         outFile << outputs[groupID];
569                                                                         outputs[groupID] = "";
570                                                                         numOutputs[groupID] = 0;
571                                                                 }
572                                                                 
573                                                                 outFile.close();
574                                                                 
575                                                                 wroteOutPut[groupID] = true;
576                                                                 wroteOutPut[groupIDB] = false;
577                                                         }else{ } //just merge b's memory with a's memory 
578                                                 }
579                                                 else{
580                                                         numOutputs[groupID] += numOutputs[groupIDA];
581                                                         outputs[groupID] += outputs[groupIDA];
582                                                         
583                                                         outputs[groupIDA] = "";
584                                                         numOutputs[groupIDA] = 0;
585                                                         
586                                                         if (wroteOutPut[groupIDA]) {
587                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
588                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
589                                                                 
590                                                                 outFile.open(fileName.c_str(), ios::app);
591                                                                 
592                                                                 long size;
593                                                                 char* memblock;
594
595                                                                 size = fileB.tellg();
596                                                                                                                         
597                                                                 fileB.seekg (0, ios::beg);
598                                                                 
599                                                                 int numRead = size / 1024;
600                                                                 int lastRead = size % 1024;
601
602                                                                 for (int i = 0; i < numRead; i++) {
603                                 
604                                                                         memblock = new char [1024];
605                                                                 
606                                                                         fileB.read (memblock, 1024);
607                                                                         string temp = memblock;
608                                                                         outFile << temp.substr(0, 1024);
609                                                                         
610                                                                         delete memblock;
611                                                                 }
612                                                                 
613                                                                 memblock = new char [lastRead];
614                                                                 
615                                                                 fileB.read (memblock, lastRead);
616                                                                 
617                                                                 //not sure why but it will read more than lastRead char...??
618                                                                 string temp = memblock;
619                                                                 outFile << temp.substr(0, lastRead);
620                                                                         
621                                                                 delete memblock;
622                                                                 
623                                                                 fileB.close();
624                                                                 remove(fileName2.c_str());
625                                                                 
626                                                                 //write out the merged memory
627                                                                 if (numOutputs[groupID] > 60) {
628                                                                         outFile << outputs[groupID];
629                                                                         outputs[groupID] = "";
630                                                                         numOutputs[groupID] = 0;
631                                                                 }
632                                                                 
633                                                                 outFile.close();
634                                                                 
635                                                                 wroteOutPut[groupID] = true;
636                                                                 wroteOutPut[groupIDA] = false;
637                                                         }else { } //just merge memory
638                                                 }                                       
639                                         }
640                                 }
641                         }
642                         m->gobble(dFile);
643                 }
644                 dFile.close();
645                 
646                 for (int i = 0; i < numGroups; i++) {
647                         if (numOutputs[i] > 0) {
648                                 string fileName = distFile + "." + toString(i) + ".temp";
649                                 outFile.open(fileName.c_str(), ios::app);
650                                 outFile << outputs[i];
651                                 outFile.close();
652                         }
653                 }
654
655                 splitNames(groups);
656                                 
657                 return 0;                       
658         }
659         catch(exception& e) {
660                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
661                 exit(1);
662         }
663 }
664 //********************************************************************************************************************
665 int SplitMatrix::splitNames(vector<set<string> >& groups){
666         try {
667                 int numGroups = groups.size();
668         
669                 ifstream bigNameFile(namefile.c_str());
670                 if(!bigNameFile){
671                         cerr << "Error: We can't open the name file\n";
672                         exit(1);
673                 }
674                 
675                 map<string, string> nameMap;
676                 string name, nameList;
677                 while(bigNameFile){
678                         bigNameFile >> name >> nameList;
679                         nameMap[name] = nameList;
680                         m->gobble(bigNameFile);
681                 }
682                 bigNameFile.close();
683                         
684                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
685                         int numSeqsInGroup = groups[i].size();
686                         
687                         if(numSeqsInGroup > 0){
688                                 string fileName = namefile + "." + toString(i) + ".temp";
689                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
690                                 
691                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
692                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
693                                         if (nIt != nameMap.end()) {
694                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
695                                                 nameMap.erase(nIt);
696                                         }else{
697                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
698                                         }
699                                 }
700                                 smallNameFile.close();
701                         }
702                 }
703                 
704                 //names of singletons
705                 if (nameMap.size() != 0) {
706                         singleton = namefile + ".extra.temp";
707                         ofstream remainingNames(singleton.c_str(), ios::ate);
708                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
709                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
710                         }
711                         remainingNames.close();
712                 }else { singleton = "none"; }
713                         
714                 for(int i=0;i<numGroups;i++){
715                         if(groups[i].size() > 0){
716                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
717                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
718                                 
719                                 map<string, string> temp;
720                                 temp[tempDistFile] = tempNameFile;
721                                 dists.push_back(temp);
722                         }
723                 }
724                 
725                 if (m->control_pressed)  {  
726                         for (int i = 0; i < dists.size(); i++) { 
727                                 remove((dists[i].begin()->first).c_str());
728                                 remove((dists[i].begin()->second).c_str());
729                         }
730                         dists.clear();
731                 }
732                 
733                 return 0;
734         }
735         catch(exception& e) {
736                 m->errorOut(e, "SplitMatrix", "splitNames");
737                 exit(1);
738         }
739 }
740 //********************************************************************************************************************
741 int SplitMatrix::splitDistanceRAM(){
742         try {
743                 vector<set<string> > groups;
744                 vector<string> outputs;
745                 
746                 int numGroups = 0;
747
748                 ifstream dFile;
749                 m->openInputFile(distFile, dFile);
750
751                 while(dFile){
752                         string seqA, seqB;
753                         float dist;
754
755                         dFile >> seqA >> seqB >> dist;
756                         
757                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
758                                         
759                         if(dist < cutoff){
760                                 //cout << "in cutoff: " << dist << endl;
761                                 int groupIDA = -1;
762                                 int groupIDB = -1;
763                                 int groupID = -1;
764                                 
765                                 for(int i=0;i<numGroups;i++){
766                                         set<string>::iterator aIt = groups[i].find(seqA);
767                                         set<string>::iterator bIt = groups[i].find(seqB);
768                                         
769                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
770                                                 groups[i].insert(seqB);
771                                                 groupIDA = i;
772                                                 groupID = groupIDA;
773
774                                                 //cout << "in aIt: " << groupID << endl;
775         //                                      break;
776                                         }
777                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
778                                                 groups[i].insert(seqA);
779                                                 groupIDB = i;
780                                                 groupID = groupIDB;
781
782                                         //      cout << "in bIt: " << groupID << endl;
783         //                                      break;
784                                         }
785                                 
786                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
787                                                 if(groupIDA < groupIDB){
788                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
789                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
790                                                         groups[groupIDB].clear(); 
791                                                         groupID = groupIDA;
792                                                 }
793                                                 else{
794                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
795                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
796                                                         groups[groupIDA].clear();  
797                                                         groupID = groupIDB;
798                                                 }
799                                                 break;
800                                         }
801                                 }
802                                 
803         //windows is gonna gag on the reuse of outFile, will need to make it local...
804                                 
805                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
806                                         set<string> newGroup;
807                                         newGroup.insert(seqA);
808                                         newGroup.insert(seqB);
809                                         groups.push_back(newGroup);
810                                                                         
811                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
812                                         outputs.push_back(tempOut);
813                                         numGroups++;
814                                 }
815                                 else{
816                                                                                         
817                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
818                                         
819                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
820                                                 string row, column, distance;
821                                                 if(groupIDA<groupIDB){
822                                                         //merge memory
823                                                         outputs[groupID] += outputs[groupIDB];
824                                                         outputs[groupIDB] = "";
825                                                 }else{
826                                                         outputs[groupID] += outputs[groupIDA];
827                                                         outputs[groupIDA] = "";
828                                                 }                                       
829                                         }
830                                 }
831                         }
832                         m->gobble(dFile);
833                 }
834                 dFile.close();
835                 
836                 for (int i = 0; i < numGroups; i++) {
837                         if (outputs[i] != "") {
838                                 ofstream outFile;
839                                 string fileName = distFile + "." + toString(i) + ".temp";
840                                 outFile.open(fileName.c_str(), ios::ate);
841                                 outFile << outputs[i];
842                                 outFile.close();
843                         }
844                 }
845
846                 splitNames(groups);
847                                 
848                 return 0;                       
849         }
850         catch(exception& e) {
851                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
852                 exit(1);
853         }
854 }
855 //********************************************************************************************************************
//returns the number of bytes in the named file; if the file cannot be opened the problem is
//reported through perror and 0 is returned, and 0 is also returned if the size cannot be determined
static long distFileSizeInBytes(const string& filename){
	FILE * pFile = fopen (filename.c_str(),"rb");
	if (pFile==NULL) {
		string error = "Error opening " + filename;
		perror (error.c_str());
		return 0;
	}
	
	long numBytes = 0;
	if (fseek (pFile, 0, SEEK_END) == 0) {
		numBytes = ftell (pFile);
		if (numBytes < 0) { numBytes = 0; }  //ftell reports failure with -1
	}
	fclose (pFile);
	
	return numBytes;
}
//sorts biggest to smallest
//  left and right each map a distance file name to a name file; only the first key of each map is
//  looked at. Returns true when left's file holds more bytes than right's file.
//  An empty map is treated as a zero-byte file.
inline bool compareFileSizes(map<string, string> left, map<string, string> right){
	
	//left is measured before right so any "Error opening" messages keep that order
	long leftsize = 0;
	if (!left.empty()) { leftsize = distFileSizeInBytes(left.begin()->first); }
	
	long rightsize = 0;
	if (!right.empty()) { rightsize = distFileSizeInBytes(right.begin()->first); }

	return (leftsize > rightsize);	
}
889 /***********************************************************************/
890 //returns map of distance files -> namefile sorted by distance file size
891 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
892         try {   
893                 
894                 sort(dists.begin(), dists.end(), compareFileSizes);
895                 
896                 return dists;
897         }
898         catch(exception& e) {
899                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
900                 exit(1);
901         }
902 }
903 /***********************************************************************/
904 SplitMatrix::~SplitMatrix(){}
905 /***********************************************************************/
906