]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
Merge branch 'master' of https://github.com/mothur/mothur.git
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24         large = l;
25 }
26 /***********************************************************************/
27
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, bool cl, string output){
29         m = MothurOut::getInstance();
30         fastafile = ffile;
31         namefile = name;
32         taxFile = tax;
33         cutoff = c;  //tax level cutoff
34         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
35         method = t;
36         processors = p;
37     classic = cl;
38         outputDir = output;
39 }
40
41 /***********************************************************************/
42
43 int SplitMatrix::split(){
44         try {
45         
46                 if (method == "distance") {  
47                         splitDistance();
48                 }else if ((method == "classify") || (method == "fasta")) {
49                         splitClassify();
50                 }else {
51                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
52                         map<string, string> temp;
53                         temp[distFile] = namefile;
54                         dists.push_back(temp);
55                 }
56                 
57                 return 0;
58         }
59         catch(exception& e) {
60                 m->errorOut(e, "SplitMatrix", "split");
61                 exit(1);
62         }
63 }
64 /***********************************************************************/
65 int SplitMatrix::splitDistance(){
66         try {
67         
68                 if (large)      { splitDistanceLarge(); }
69                 else            { splitDistanceRAM();   }
70                 
71                 return 0;
72                         
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "SplitMatrix", "splitDistance");
76                 exit(1);
77         }
78 }
79
80 /***********************************************************************/
81 int SplitMatrix::splitClassify(){
82         try {
83                 cutoff = int(cutoff);
84                                 
85                 map<string, int> seqGroup;
86                 map<string, int>::iterator it;
87                 map<string, int>::iterator it2;
88                 
89                 int numGroups = 0;
90                 
91                 //build tree from users taxonomy file
92                 PhyloTree* phylo = new PhyloTree();
93                 
94                 ifstream in;
95                 m->openInputFile(taxFile, in);
96                         
97                 //read in users taxonomy file and add sequences to tree
98                 string seqname, tax;
99                 while(!in.eof()){
100                         in >> seqname >> tax; m->gobble(in);
101                         phylo->addSeqToTree(seqname, tax);
102                 }
103                 in.close();
104                 
105                 phylo->assignHeirarchyIDs(0);
106
107                 //make sure the cutoff is not greater than maxlevel
108                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
109         
110                 //for each node in tree
111                 for (int i = 0; i < phylo->getNumNodes(); i++) {
112                 
113                         //is this node within the cutoff
114                         TaxNode taxon = phylo->get(i);
115         
116                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
117                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
118                                         for (int j = 0; j < taxon.accessions.size(); j++) {
119                                                 seqGroup[taxon.accessions[j]] = numGroups;
120                                         }
121                                         numGroups++;
122                                 }
123                         }
124                 }
125         
126                 delete phylo;
127                 
128                 if (method == "classify") {
129                         splitDistanceFileByTax(seqGroup, numGroups);
130                 }else {
131                         createDistanceFilesFromTax(seqGroup, numGroups);
132                 }
133                 
134                 return 0;
135                         
136         }
137         catch(exception& e) {
138                 m->errorOut(e, "SplitMatrix", "splitClassify");
139                 exit(1);
140         }
141 }
142 /***********************************************************************/
143 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
144         try {
145                 map<string, int> copyGroups = seqGroup;
146                 map<string, int>::iterator it;
147                 set<string> names;
148                                 
149                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
150                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
151                 }
152                         
153                 ifstream in;
154                 m->openInputFile(fastafile, in);
155         
156                 //parse fastafile
157                 ofstream outFile;
158                 while (!in.eof()) {
159                         Sequence query(in); m->gobble(in);
160                         if (query.getName() != "") {
161                 
162                                 it = seqGroup.find(query.getName());
163                                 
164                                 //save names in case no namefile is given
165                                 if (namefile == "") {  names.insert(query.getName()); }
166                         
167                                 if (it != seqGroup.end()) { //not singleton 
168                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
169                                         query.printSequence(outFile); 
170                                         outFile.close();
171                                         
172                                         copyGroups.erase(query.getName());
173                                 }
174                         }
175                 }
176                 in.close();
177                 
178                 //warn about sequence in groups that are not in fasta file
179                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
180                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
181                         exit(1);
182                 }
183                 
184                 copyGroups.clear();
185                 
186                 //process each distance file
187                 for (int i = 0; i < numGroups; i++) { 
188                         
189                         string options = "";
190             if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
191             else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
192                         if (outputDir != "") { options += ", outputdir=" + outputDir; }
193                         
194                         Command* command = new DistanceCommand(options);
195                         
196                         command->execute();
197                         delete command;
198                         
199                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
200                         
201                         //remove old names files just in case
202                         m->mothurRemove((namefile + "." + toString(i) + ".temp"));
203                 }
204                         
205                 singleton = namefile + ".extra.temp";
206                 ofstream remainingNames;
207                 m->openOutputFile(singleton, remainingNames);
208                 
209                 bool wroteExtra = false;
210
211                 ifstream bigNameFile;
212                 m->openInputFile(namefile, bigNameFile);
213                 
214                 string name, nameList;
215                 while(!bigNameFile.eof()){
216                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
217                         
218                         //did this sequence get assigned a group
219                         it = seqGroup.find(name);
220                         
221                         if (it != seqGroup.end()) {  
222                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
223                                 outFile << name << '\t' << nameList << endl;
224                                 outFile.close();
225                         }else{
226                                 wroteExtra = true;
227                                 remainingNames << name << '\t' << nameList << endl;
228                         }
229                 }
230                 bigNameFile.close();
231                 
232                 for(int i=0;i<numGroups;i++){
233                         string tempNameFile = namefile + "." + toString(i) + ".temp";
234                         if (outputDir == "") { outputDir = m->hasPath(fastafile); }
235                         string tempDistFile = "";
236             if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
237             else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
238
239                         //if there are valid distances
240                         ifstream fileHandle;
241                         fileHandle.open(tempDistFile.c_str());
242                         if(fileHandle)  {       
243                                 m->gobble(fileHandle);
244                                 if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
245                                         map<string, string> temp;
246                                         temp[tempDistFile] = tempNameFile;
247                                         dists.push_back(temp);
248                                 }else {
249                                         ifstream in;
250                                         m->openInputFile(tempNameFile, in);
251                                 
252                                         while(!in.eof()) { 
253                                                 in >> name >> nameList;  m->gobble(in);
254                                                 wroteExtra = true;
255                                                 remainingNames << name << '\t' << nameList << endl;
256                                         }
257                                         in.close();
258                                         m->mothurRemove(tempNameFile);
259                                 }
260                         }
261                         fileHandle.close();
262                 }
263                 
264                 remainingNames.close();
265                 if (!wroteExtra) { 
266                         m->mothurRemove(singleton);
267                         singleton = "none";
268                 }
269
270                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
271                 
272                 return 0;
273         }
274         catch(exception& e) {
275                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
276                 exit(1);
277         }
278 }
279 /***********************************************************************/
280 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
281         try {
282                 map<string, int>::iterator it;
283                 map<string, int>::iterator it2;
284                 
285                 ifstream dFile;
286                 m->openInputFile(distFile, dFile);
287                 ofstream outFile;
288                 
289                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
290                         m->mothurRemove((distFile + "." + toString(i) + ".temp"));
291                 }
292                 
293                 //for buffering the io to improve speed
294                  //allow for 10 dists to be stored, then output.
295                 vector<string> outputs;  outputs.resize(numGroups, "");
296                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
297                 
298                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
299                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
300                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
301                 
302                 //for each distance
303                 while(dFile){
304                         string seqA, seqB;
305                         float dist;
306                         
307                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp"));       } }
308                         
309                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
310                         
311                         //if both sequences are in the same group then they are within the cutoff
312                         it = seqGroup.find(seqA);
313                         it2 = seqGroup.find(seqB);
314                         
315                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
316                                 if (it->second == it2->second) { //they are from the same group so add the distance
317                                         if (numOutputs[it->second] > 30) {
318                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
319                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
320                                                 outFile.close();
321                                                 outputs[it->second] = "";
322                                                 numOutputs[it->second] = 0;
323                                                 validDistances[it->second] = true;
324                                         }else{
325                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
326                                                 numOutputs[it->second]++;
327                                         }
328                                 }
329                         }
330                 }
331                 dFile.close();
332         
333                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
334                         m->mothurRemove((namefile + "." + toString(i) + ".temp"));
335                         
336                         //write out any remaining buffers
337                         if (numOutputs[i] > 0) {
338                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
339                                 outFile << outputs[i];
340                                 outFile.close();
341                                 outputs[i] = "";
342                                 numOutputs[i] = 0;
343                                 validDistances[i] = true;
344                         }
345                 }
346                 
347                 ifstream bigNameFile;
348                 m->openInputFile(namefile, bigNameFile);
349                 
350                 singleton = namefile + ".extra.temp";
351                 ofstream remainingNames;
352                 m->openOutputFile(singleton, remainingNames);
353                 
354                 bool wroteExtra = false;
355                                                 
356                 string name, nameList;
357                 while(!bigNameFile.eof()){
358                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
359                         
360                         //did this sequence get assigned a group
361                         it = seqGroup.find(name);
362                         
363                         if (it != seqGroup.end()) {  
364                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
365                                 outFile << name << '\t' << nameList << endl;
366                                 outFile.close();
367                         }else{
368                                 wroteExtra = true;
369                                 remainingNames << name << '\t' << nameList << endl;
370                         }
371                 }
372                 bigNameFile.close();
373                                 
374                 for(int i=0;i<numGroups;i++){
375                         string tempNameFile = namefile + "." + toString(i) + ".temp";
376                         string tempDistFile = distFile + "." + toString(i) + ".temp";
377
378                         //if there are valid distances
379                         if (validDistances[i]) {
380                                 map<string, string> temp;
381                                 temp[tempDistFile] = tempNameFile;
382                                 dists.push_back(temp);
383                         }else{
384                                 ifstream in;
385                                 m->openInputFile(tempNameFile, in);
386                                 
387                                 while(!in.eof()) { 
388                                         in >> name >> nameList;  m->gobble(in);
389                                         wroteExtra = true;
390                                         remainingNames << name << '\t' << nameList << endl;
391                                 }
392                                 in.close();
393                                 m->mothurRemove(tempNameFile);
394                         }
395                 }
396                 
397                 remainingNames.close();
398                 
399                 if (!wroteExtra) { 
400                         m->mothurRemove(singleton);
401                         singleton = "none";
402                 }
403
404                 if (m->control_pressed)  {  
405                         for (int i = 0; i < dists.size(); i++) { 
406                                 m->mothurRemove((dists[i].begin()->first));
407                                 m->mothurRemove((dists[i].begin()->second));
408                         }
409                         dists.clear();
410                 }
411                 
412                 return 0;
413         }
414         catch(exception& e) {
415                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
416                 exit(1);
417         }
418 }
419 /***********************************************************************/
420 int SplitMatrix::splitDistanceLarge(){
421         try {
422                 vector<set<string> > groups;
423                 
424                 //for buffering the io to improve speed
425                  //allow for 30 dists to be stored, then output.
426                 vector<string> outputs;
427                 vector<int> numOutputs;
428                 vector<bool> wroteOutPut;
429                 
430                 int numGroups = 0;
431
432                 ofstream outFile;
433                 ifstream dFile;
434                 m->openInputFile(distFile, dFile);
435         
436                 while(dFile){
437                         string seqA, seqB;
438                         float dist;
439
440                         dFile >> seqA >> seqB >> dist;
441                         
442                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
443                                         
444                         if(dist < cutoff){
445                                 //cout << "in cutoff: " << dist << endl;
446                                 int groupIDA = -1;
447                                 int groupIDB = -1;
448                                 int groupID = -1;
449                                 
450                                 for(int i=0;i<numGroups;i++){
451                                         set<string>::iterator aIt = groups[i].find(seqA);
452                                         set<string>::iterator bIt = groups[i].find(seqB);
453                                         
454                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
455                                                 groups[i].insert(seqB);
456                                                 groupIDA = i;
457                                                 groupID = groupIDA;
458
459                                                 //cout << "in aIt: " << groupID << endl;
460         //                                      break;
461                                         }
462                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
463                                                 groups[i].insert(seqA);
464                                                 groupIDB = i;
465                                                 groupID = groupIDB;
466
467                                         //      cout << "in bIt: " << groupID << endl;
468         //                                      break;
469                                         }
470                                 
471                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
472                                                 if(groupIDA < groupIDB){
473                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
474                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
475                                                         groups[groupIDB].clear(); 
476                                                         groupID = groupIDA;
477                                                 }
478                                                 else{
479                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
480                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
481                                                         groups[groupIDA].clear();  
482                                                         groupID = groupIDB;
483                                                 }
484                                                 break;
485                                         }
486                                 }
487                                 
488         //windows is gonna gag on the reuse of outFile, will need to make it local...
489                                 
490                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
491                                         set<string> newGroup;
492                                         newGroup.insert(seqA);
493                                         newGroup.insert(seqB);
494                                         groups.push_back(newGroup);
495                                                                         
496                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
497                                         outputs.push_back(tempOut);
498                                         numOutputs.push_back(1);
499                                         wroteOutPut.push_back(false);
500                                         
501                                         numGroups++;
502                                 }
503                                 else{
504                                         string fileName = distFile + "." + toString(groupID) + ".temp";
505                                                                                         
506                                         //have we reached the max buffer size
507                                         if (numOutputs[groupID] > 60) { //write out sequence
508                                                 outFile.open(fileName.c_str(), ios::app);
509                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
510                                                 outFile.close();
511                                                 
512                                                 outputs[groupID] = "";
513                                                 numOutputs[groupID] = 0;
514                                                 wroteOutPut[groupID] = true;
515                                         }else {
516                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
517                                                 numOutputs[groupID]++;
518                                         }
519                                         
520                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
521                                                 string row, column, distance;
522                                                 if(groupIDA<groupIDB){
523                                                         
524                                                         //merge memory
525                                                         numOutputs[groupID] += numOutputs[groupIDB];
526                                                         outputs[groupID] += outputs[groupIDB];
527                                                         
528                                                         outputs[groupIDB] = "";
529                                                         numOutputs[groupIDB] = 0;
530                                                         
531                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
532                                                         if (wroteOutPut[groupIDB]) {
533                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
534                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
535                                                                 
536                                                                 outFile.open(fileName.c_str(), ios::app);
537                                                                 
538                                                                 long size;
539                                                                 char* memblock;
540
541                                                                 size = fileB.tellg();
542                                 
543                                                                 fileB.seekg (0, ios::beg);
544                                                                 
545                                                                 int numRead = size / 1024;
546                                                                 int lastRead = size % 1024;
547
548                                                                 for (int i = 0; i < numRead; i++) {
549                                 
550                                                                         memblock = new char [1024];
551                                                                 
552                                                                         fileB.read (memblock, 1024);
553                                                                         
554                                                                         string temp = memblock;
555                                                                         outFile << temp.substr(0, 1024);
556                                                                         
557                                                                         delete memblock;
558                                                                 }
559                                                                 
560                                                                 memblock = new char [lastRead];
561                                                                 
562                                                                 fileB.read (memblock, lastRead);
563                                                                 
564                                                                 //not sure why but it will read more than lastRead char...??
565                                                                 string temp = memblock;
566                                                                 outFile << temp.substr(0, lastRead);
567                                                                 delete memblock;
568                                                                 
569                                                                 fileB.close();
570                                                                 m->mothurRemove(fileName2);
571                                                                 
572                                                                 //write out the merged memory
573                                                                 if (numOutputs[groupID] > 60) {
574                                                                         outFile << outputs[groupID];
575                                                                         outputs[groupID] = "";
576                                                                         numOutputs[groupID] = 0;
577                                                                 }
578                                                                 
579                                                                 outFile.close();
580                                                                 
581                                                                 wroteOutPut[groupID] = true;
582                                                                 wroteOutPut[groupIDB] = false;
583                                                         }else{ } //just merge b's memory with a's memory 
584                                                 }
585                                                 else{
586                                                         numOutputs[groupID] += numOutputs[groupIDA];
587                                                         outputs[groupID] += outputs[groupIDA];
588                                                         
589                                                         outputs[groupIDA] = "";
590                                                         numOutputs[groupIDA] = 0;
591                                                         
592                                                         if (wroteOutPut[groupIDA]) {
593                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
594                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
595                                                                 
596                                                                 outFile.open(fileName.c_str(), ios::app);
597                                                                 
598                                                                 long size;
599                                                                 char* memblock;
600
601                                                                 size = fileB.tellg();
602                                                                                                                         
603                                                                 fileB.seekg (0, ios::beg);
604                                                                 
605                                                                 int numRead = size / 1024;
606                                                                 int lastRead = size % 1024;
607
608                                                                 for (int i = 0; i < numRead; i++) {
609                                 
610                                                                         memblock = new char [1024];
611                                                                 
612                                                                         fileB.read (memblock, 1024);
613                                                                         string temp = memblock;
614                                                                         outFile << temp.substr(0, 1024);
615                                                                         
616                                                                         delete memblock;
617                                                                 }
618                                                                 
619                                                                 memblock = new char [lastRead];
620                                                                 
621                                                                 fileB.read (memblock, lastRead);
622                                                                 
623                                                                 //not sure why but it will read more than lastRead char...??
624                                                                 string temp = memblock;
625                                                                 outFile << temp.substr(0, lastRead);
626                                                                         
627                                                                 delete memblock;
628                                                                 
629                                                                 fileB.close();
630                                                                 m->mothurRemove(fileName2);
631                                                                 
632                                                                 //write out the merged memory
633                                                                 if (numOutputs[groupID] > 60) {
634                                                                         outFile << outputs[groupID];
635                                                                         outputs[groupID] = "";
636                                                                         numOutputs[groupID] = 0;
637                                                                 }
638                                                                 
639                                                                 outFile.close();
640                                                                 
641                                                                 wroteOutPut[groupID] = true;
642                                                                 wroteOutPut[groupIDA] = false;
643                                                         }else { } //just merge memory
644                                                 }                                       
645                                         }
646                                 }
647                         }
648                         m->gobble(dFile);
649                 }
650                 dFile.close();
651                 
652                 for (int i = 0; i < numGroups; i++) {
653                         if (numOutputs[i] > 0) {
654                                 string fileName = distFile + "." + toString(i) + ".temp";
655                                 outFile.open(fileName.c_str(), ios::app);
656                                 outFile << outputs[i];
657                                 outFile.close();
658                         }
659                 }
660
661                 splitNames(groups);
662                                 
663                 return 0;                       
664         }
665         catch(exception& e) {
666                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
667                 exit(1);
668         }
669 }
670 //********************************************************************************************************************
671 int SplitMatrix::splitNames(vector<set<string> >& groups){
672         try {
673                 int numGroups = groups.size();
674         
675                 ifstream bigNameFile(namefile.c_str());
676                 if(!bigNameFile){
677                         cerr << "Error: We can't open the name file\n";
678                         exit(1);
679                 }
680                 
681                 map<string, string> nameMap;
682                 string name, nameList;
683                 while(bigNameFile){
684                         bigNameFile >> name >> nameList;
685                         nameMap[name] = nameList;
686                         m->gobble(bigNameFile);
687                 }
688                 bigNameFile.close();
689                         
690                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
691                         int numSeqsInGroup = groups[i].size();
692                         
693                         if(numSeqsInGroup > 0){
694                                 string fileName = namefile + "." + toString(i) + ".temp";
695                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
696                                 
697                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
698                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
699                                         if (nIt != nameMap.end()) {
700                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
701                                                 nameMap.erase(nIt);
702                                         }else{
703                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
704                                         }
705                                 }
706                                 smallNameFile.close();
707                         }
708                 }
709                 
710                 //names of singletons
711                 if (nameMap.size() != 0) {
712                         singleton = namefile + ".extra.temp";
713                         ofstream remainingNames(singleton.c_str(), ios::ate);
714                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
715                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
716                         }
717                         remainingNames.close();
718                 }else { singleton = "none"; }
719                         
720                 for(int i=0;i<numGroups;i++){
721                         if(groups[i].size() > 0){
722                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
723                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
724                                 
725                                 map<string, string> temp;
726                                 temp[tempDistFile] = tempNameFile;
727                                 dists.push_back(temp);
728                         }
729                 }
730                 
731                 if (m->control_pressed)  {  
732                         for (int i = 0; i < dists.size(); i++) { 
733                                 m->mothurRemove((dists[i].begin()->first));
734                                 m->mothurRemove((dists[i].begin()->second));
735                         }
736                         dists.clear();
737                 }
738                 
739                 return 0;
740         }
741         catch(exception& e) {
742                 m->errorOut(e, "SplitMatrix", "splitNames");
743                 exit(1);
744         }
745 }
746 //********************************************************************************************************************
747 int SplitMatrix::splitDistanceRAM(){
748         try {
749                 vector<set<string> > groups;
750                 vector<string> outputs;
751                 
752                 int numGroups = 0;
753
754                 ifstream dFile;
755                 m->openInputFile(distFile, dFile);
756
757                 while(dFile){
758                         string seqA, seqB;
759                         float dist;
760
761                         dFile >> seqA >> seqB >> dist;
762                         
763                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
764                                         
765                         if(dist < cutoff){
766                                 //cout << "in cutoff: " << dist << endl;
767                                 int groupIDA = -1;
768                                 int groupIDB = -1;
769                                 int groupID = -1;
770                                 
771                                 for(int i=0;i<numGroups;i++){
772                                         set<string>::iterator aIt = groups[i].find(seqA);
773                                         set<string>::iterator bIt = groups[i].find(seqB);
774                                         
775                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
776                                                 groups[i].insert(seqB);
777                                                 groupIDA = i;
778                                                 groupID = groupIDA;
779
780                                                 //cout << "in aIt: " << groupID << endl;
781         //                                      break;
782                                         }
783                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
784                                                 groups[i].insert(seqA);
785                                                 groupIDB = i;
786                                                 groupID = groupIDB;
787
788                                         //      cout << "in bIt: " << groupID << endl;
789         //                                      break;
790                                         }
791                                 
792                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
793                                                 if(groupIDA < groupIDB){
794                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
795                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
796                                                         groups[groupIDB].clear(); 
797                                                         groupID = groupIDA;
798                                                 }
799                                                 else{
800                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
801                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
802                                                         groups[groupIDA].clear();  
803                                                         groupID = groupIDB;
804                                                 }
805                                                 break;
806                                         }
807                                 }
808                                 
809         //windows is gonna gag on the reuse of outFile, will need to make it local...
810                                 
811                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
812                                         set<string> newGroup;
813                                         newGroup.insert(seqA);
814                                         newGroup.insert(seqB);
815                                         groups.push_back(newGroup);
816                                                                         
817                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
818                                         outputs.push_back(tempOut);
819                                         numGroups++;
820                                 }
821                                 else{
822                                                                                         
823                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
824                                         
825                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
826                                                 string row, column, distance;
827                                                 if(groupIDA<groupIDB){
828                                                         //merge memory
829                                                         outputs[groupID] += outputs[groupIDB];
830                                                         outputs[groupIDB] = "";
831                                                 }else{
832                                                         outputs[groupID] += outputs[groupIDA];
833                                                         outputs[groupIDA] = "";
834                                                 }                                       
835                                         }
836                                 }
837                         }
838                         m->gobble(dFile);
839                 }
840                 dFile.close();
841                 
842                 for (int i = 0; i < numGroups; i++) {
843                         if (outputs[i] != "") {
844                                 ofstream outFile;
845                                 string fileName = distFile + "." + toString(i) + ".temp";
846                                 outFile.open(fileName.c_str(), ios::ate);
847                                 outFile << outputs[i];
848                                 outFile.close();
849                         }
850                 }
851
852                 splitNames(groups);
853                                 
854                 return 0;                       
855         }
856         catch(exception& e) {
857                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
858                 exit(1);
859         }
860 }
861 //********************************************************************************************************************
862 //sorts biggest to smallest
863 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
864         
865         FILE * pFile;
866         long leftsize = 0;
867                 
868         //get num bytes in file
869         string filename = left.begin()->first;
870         pFile = fopen (filename.c_str(),"rb");
871         string error = "Error opening " + filename;
872         if (pFile==NULL) perror (error.c_str());
873         else{
874                 fseek (pFile, 0, SEEK_END);
875                 leftsize=ftell (pFile);
876                 fclose (pFile);
877         }
878
879         FILE * pFile2;
880         long rightsize = 0;
881                 
882         //get num bytes in file
883         filename = right.begin()->first;
884         pFile2 = fopen (filename.c_str(),"rb");
885         error = "Error opening " + filename;
886         if (pFile2==NULL) perror (error.c_str());
887         else{
888                 fseek (pFile2, 0, SEEK_END);
889                 rightsize=ftell (pFile2);
890                 fclose (pFile2);
891         }
892
893         return (leftsize > rightsize);  
894
895 /***********************************************************************/
896 //returns map of distance files -> namefile sorted by distance file size
897 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
898         try {   
899                 
900                 sort(dists.begin(), dists.end(), compareFileSizes);
901                 
902                 return dists;
903         }
904         catch(exception& e) {
905                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
906                 exit(1);
907         }
908 }
909 /***********************************************************************/
910 SplitMatrix::~SplitMatrix(){}
911 /***********************************************************************/
912