]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
changed random forest output filename
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24     countfile = count;
25         large = l;
26 }
27 /***********************************************************************/
28
29 SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){
30         m = MothurOut::getInstance();
31         fastafile = ffile;
32         namefile = name;
33     countfile = count;
34         taxFile = tax;
35         cutoff = c;  //tax level cutoff
36         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
37         method = t;
38         processors = p;
39     classic = cl;
40         outputDir = output;
41 }
42
43 /***********************************************************************/
44
45 int SplitMatrix::split(){
46         try {
47         
48                 if (method == "distance") {  
49                         splitDistance();
50                 }else if ((method == "classify") || (method == "fasta")) {
51                         splitClassify();
52                 }else {
53                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
54                         map<string, string> temp;
55                         if (namefile != "") {  temp[distFile] = namefile; }
56             else { temp[distFile] = countfile; }
57                         dists.push_back(temp);
58                 }
59                 
60                 return 0;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "SplitMatrix", "split");
64                 exit(1);
65         }
66 }
67 /***********************************************************************/
68 int SplitMatrix::splitDistance(){
69         try {
70         
71                 if (large)      { splitDistanceLarge(); }
72                 else            { splitDistanceRAM();   }
73                 
74                 return 0;
75                         
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "SplitMatrix", "splitDistance");
79                 exit(1);
80         }
81 }
82
83 /***********************************************************************/
84 int SplitMatrix::splitClassify(){
85         try {
86                 cutoff = int(cutoff);
87                                 
88                 map<string, int> seqGroup;
89                 map<string, int>::iterator it;
90                 map<string, int>::iterator it2;
91                 
92                 int numGroups = 0;
93                 
94                 //build tree from users taxonomy file
95                 PhyloTree* phylo = new PhyloTree();
96                 
97         map<string, string> temp;
98         m->readTax(taxFile, temp);
99         
100         for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
101             phylo->addSeqToTree(itTemp->first, itTemp->second);
102             temp.erase(itTemp++);
103         }
104                 
105                 phylo->assignHeirarchyIDs(0);
106
107                 //make sure the cutoff is not greater than maxlevel
108                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
109         
110                 //for each node in tree
111                 for (int i = 0; i < phylo->getNumNodes(); i++) {
112                 
113                         //is this node within the cutoff
114                         TaxNode taxon = phylo->get(i);
115         
116                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
117                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
118                                         for (int j = 0; j < taxon.accessions.size(); j++) {
119                                                 seqGroup[taxon.accessions[j]] = numGroups;
120                                         }
121                                         numGroups++;
122                                 }
123                         }
124                 }
125         
126                 delete phylo;
127                 
128                 if (method == "classify") {
129                         splitDistanceFileByTax(seqGroup, numGroups);
130                 }else {
131                         createDistanceFilesFromTax(seqGroup, numGroups);
132                 }
133                 
134                 return 0;
135                         
136         }
137         catch(exception& e) {
138                 m->errorOut(e, "SplitMatrix", "splitClassify");
139                 exit(1);
140         }
141 }
142 /***********************************************************************/
143 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
144         try {
145                 map<string, int> copyGroups = seqGroup;
146                 map<string, int>::iterator it;
147                 set<string> names;
148                                 
149                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
150                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
151                 }
152                         
153                 ifstream in;
154                 m->openInputFile(fastafile, in);
155         
156                 //parse fastafile
157                 ofstream outFile;
158                 while (!in.eof()) {
159                         Sequence query(in); m->gobble(in);
160                         if (query.getName() != "") {
161                 
162                                 it = seqGroup.find(query.getName());
163                                 
164                                 //save names in case no namefile is given
165                                 if ((namefile == "") && (countfile == "")) {  names.insert(query.getName()); }
166                         
167                                 if (it != seqGroup.end()) { //not singleton 
168                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
169                                         query.printSequence(outFile); 
170                                         outFile.close();
171                                         
172                                         copyGroups.erase(query.getName());
173                                 }
174                         }
175                 }
176                 in.close();
177                 
178                 //warn about sequence in groups that are not in fasta file
179                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
180                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
181                         exit(1);
182                 }
183                 
184                 copyGroups.clear();
185         
186                 //process each distance file
187                 for (int i = 0; i < numGroups; i++) { 
188                         
189                         string options = "";
190             if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
191             else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
192                         if (outputDir != "") { options += ", outputdir=" + outputDir; }
193                         
194             m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
195             
196                         Command* command = new DistanceCommand(options);
197                         
198             m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
199             
200                         command->execute();
201                         delete command;
202                         
203                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
204                         
205                         //remove old names files just in case
206                         if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); }
207             else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); }
208                 }
209         
210         //restore old fasta file name since dist.seqs overwrites it with the temp files
211         m->setFastaFile(fastafile);
212         
213         vector<string> tempDistFiles;    
214         for(int i=0;i<numGroups;i++){
215             if (outputDir == "") { outputDir = m->hasPath(fastafile); }
216             string tempDistFile = "";
217             if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
218             else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
219             tempDistFiles.push_back(tempDistFile);
220         }
221         
222         splitNames(seqGroup, numGroups, tempDistFiles);
223         
224                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
225                 
226                 return 0;
227         }
228         catch(exception& e) {
229                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
230                 exit(1);
231         }
232 }
233 /***********************************************************************/
234 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
235         try {
236                 map<string, int>::iterator it;
237                 map<string, int>::iterator it2;
238                 
239         ofstream outFile;
240                 ifstream dFile;
241                 m->openInputFile(distFile, dFile);
242                 
243                 
244                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
245                         m->mothurRemove((distFile + "." + toString(i) + ".temp"));
246                 }
247                 
248                 //for buffering the io to improve speed
249                  //allow for 10 dists to be stored, then output.
250                 vector<string> outputs;  outputs.resize(numGroups, "");
251                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
252                 
253                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
254                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
255                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
256                 
257                 //for each distance
258                 while(dFile){
259                         string seqA, seqB;
260                         float dist;
261                         
262                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp"));       } }
263                         
264                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
265                         
266                         //if both sequences are in the same group then they are within the cutoff
267                         it = seqGroup.find(seqA);
268                         it2 = seqGroup.find(seqB);
269                         
270                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
271                                 if (it->second == it2->second) { //they are from the same group so add the distance
272                                         if (numOutputs[it->second] > 30) {
273                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
274                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
275                                                 outFile.close();
276                                                 outputs[it->second] = "";
277                                                 numOutputs[it->second] = 0;
278                                                 validDistances[it->second] = true;
279                                         }else{
280                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
281                                                 numOutputs[it->second]++;
282                                         }
283                                 }
284                         }
285                 }
286                 dFile.close();
287         
288         string inputFile = namefile;
289         if (countfile != "") { inputFile = countfile; }
290         
291         vector<string> tempDistFiles;
292                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
293             string tempDistFile = distFile + "." + toString(i) + ".temp";
294             tempDistFiles.push_back(tempDistFile);
295                         m->mothurRemove((inputFile + "." + toString(i) + ".temp"));
296                         
297                         //write out any remaining buffers
298                         if (numOutputs[i] > 0) {
299                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
300                                 outFile << outputs[i];
301                                 outFile.close();
302                                 outputs[i] = "";
303                                 numOutputs[i] = 0;
304                                 validDistances[i] = true;
305                         }
306                 }
307                 
308         splitNames(seqGroup, numGroups, tempDistFiles);
309         
310                 if (m->control_pressed)  {  
311                         for (int i = 0; i < dists.size(); i++) { 
312                                 m->mothurRemove((dists[i].begin()->first));
313                                 m->mothurRemove((dists[i].begin()->second));
314                         }
315                         dists.clear();
316                 }
317                 
318                 return 0;
319         }
320         catch(exception& e) {
321                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
322                 exit(1);
323         }
324 }
325 /***********************************************************************/
326 int SplitMatrix::splitDistanceLarge(){
327         try {
328                 vector<set<string> > groups;
329                 
330                 //for buffering the io to improve speed
331                  //allow for 30 dists to be stored, then output.
332                 vector<string> outputs;
333                 vector<int> numOutputs;
334                 vector<bool> wroteOutPut;
335                 
336                 int numGroups = 0;
337
338                 //ofstream outFile;
339                 ifstream dFile;
340                 m->openInputFile(distFile, dFile);
341         
342                 while(dFile){
343                         string seqA, seqB;
344                         float dist;
345
346                         dFile >> seqA >> seqB >> dist;
347                         
348                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
349                                         
350                         if(dist < cutoff){
351                                 //cout << "in cutoff: " << dist << endl;
352                                 int groupIDA = -1;
353                                 int groupIDB = -1;
354                                 int groupID = -1;
355                                 
356                                 for(int i=0;i<numGroups;i++){
357                                         set<string>::iterator aIt = groups[i].find(seqA);
358                                         set<string>::iterator bIt = groups[i].find(seqB);
359                                         
360                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
361                                                 groups[i].insert(seqB);
362                                                 groupIDA = i;
363                                                 groupID = groupIDA;
364
365                                                 //cout << "in aIt: " << groupID << endl;
366         //                                      break;
367                                         }
368                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
369                                                 groups[i].insert(seqA);
370                                                 groupIDB = i;
371                                                 groupID = groupIDB;
372
373                                         //      cout << "in bIt: " << groupID << endl;
374         //                                      break;
375                                         }
376                                 
377                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
378                                                 if(groupIDA < groupIDB){
379                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
380                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
381                                                         groups[groupIDB].clear(); 
382                                                         groupID = groupIDA;
383                                                 }
384                                                 else{
385                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
386                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
387                                                         groups[groupIDA].clear();  
388                                                         groupID = groupIDB;
389                                                 }
390                                                 break;
391                                         }
392                                 }
393                                 
394         //windows is gonna gag on the reuse of outFile, will need to make it local...
395                                 
396                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
397                                         set<string> newGroup;
398                                         newGroup.insert(seqA);
399                                         newGroup.insert(seqB);
400                                         groups.push_back(newGroup);
401                                                                         
402                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
403                                         outputs.push_back(tempOut);
404                                         numOutputs.push_back(1);
405                                         wroteOutPut.push_back(false);
406                                         
407                                         numGroups++;
408                                 }
409                                 else{
410                                         string fileName = distFile + "." + toString(groupID) + ".temp";
411                                                                                         
412                                         //have we reached the max buffer size
413                                         if (numOutputs[groupID] > 60) { //write out sequence
414                         ofstream outFile;
415                                                 outFile.open(fileName.c_str(), ios::app);
416                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
417                                                 outFile.close();
418                                                 
419                                                 outputs[groupID] = "";
420                                                 numOutputs[groupID] = 0;
421                                                 wroteOutPut[groupID] = true;
422                                         }else {
423                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
424                                                 numOutputs[groupID]++;
425                                         }
426                                         
427                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
428                                                 string row, column, distance;
429                                                 if(groupIDA<groupIDB){
430                                                         
431                                                         //merge memory
432                                                         numOutputs[groupID] += numOutputs[groupIDB];
433                                                         outputs[groupID] += outputs[groupIDB];
434                                                         
435                                                         outputs[groupIDB] = "";
436                                                         numOutputs[groupIDB] = 0;
437                                                         
438                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
439                                                         if (wroteOutPut[groupIDB]) {
440                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
441                                                                 /*ifstream fileB(fileName2.c_str(), ios::ate);
442                                                                 
443                                                                 outFile.open(fileName.c_str(), ios::app);
444                                                                 
445                                                                 long size;
446                                                                 char* memblock;
447
448                                                                 size = fileB.tellg();
449                                 
450                                                                 fileB.seekg (0, ios::beg);
451                                                                 
452                                                                 int numRead = size / 1024;
453                                                                 int lastRead = size % 1024;
454
455                                                                 for (int i = 0; i < numRead; i++) {
456                                 
457                                                                         memblock = new char [1024];
458                                                                 
459                                                                         fileB.read (memblock, 1024);
460                                                                         
461                                                                         string temp = memblock;
462                                                                         outFile << temp.substr(0, 1024);
463                                                                         
464                                                                         delete memblock;
465                                                                 }
466                                                                 
467                                                                 memblock = new char [lastRead];
468                                                                 
469                                                                 fileB.read (memblock, lastRead);
470                                                                 
471                                                                 //not sure why but it will read more than lastRead char...??
472                                                                 string temp = memblock;
473                                                                 outFile << temp.substr(0, lastRead);
474                                                                 delete memblock;
475                                                                 
476                                                                 fileB.close();*/
477                                 m->appendFiles(fileName2, fileName);
478                                                                 m->mothurRemove(fileName2);
479                         
480                                                                 
481                                                                 //write out the merged memory
482                                                                 if (numOutputs[groupID] > 60) {
483                                     ofstream tempOut;
484                                     m->openOutputFile(fileName, tempOut);
485                                                                         tempOut << outputs[groupID];
486                                                                         outputs[groupID] = "";
487                                                                         numOutputs[groupID] = 0;
488                                     tempOut.close();
489                                                                 }
490                                                                 
491                                                                 //outFile.close();
492                                                                 
493                                                                 wroteOutPut[groupID] = true;
494                                                                 wroteOutPut[groupIDB] = false;
495                                                         }else{ } //just merge b's memory with a's memory 
496                                                 }
497                                                 else{
498                                                         numOutputs[groupID] += numOutputs[groupIDA];
499                                                         outputs[groupID] += outputs[groupIDA];
500                                                         
501                                                         outputs[groupIDA] = "";
502                                                         numOutputs[groupIDA] = 0;
503                                                         
504                                                         if (wroteOutPut[groupIDA]) {
505                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
506                                                                 /*ifstream fileB(fileName2.c_str(), ios::ate);
507                                                                 
508                                                                 outFile.open(fileName.c_str(), ios::app);
509                                                                 
510                                                                 long size;
511                                                                 char* memblock;
512
513                                                                 size = fileB.tellg();
514                                                                                                                         
515                                                                 fileB.seekg (0, ios::beg);
516                                                                 
517                                                                 int numRead = size / 1024;
518                                                                 int lastRead = size % 1024;
519
520                                                                 for (int i = 0; i < numRead; i++) {
521                                 
522                                                                         memblock = new char [1024];
523                                                                 
524                                                                         fileB.read (memblock, 1024);
525                                                                         string temp = memblock;
526                                                                         outFile << temp.substr(0, 1024);
527                                                                         
528                                                                         delete memblock;
529                                                                 }
530                                                                 
531                                                                 memblock = new char [lastRead];
532                                                                 
533                                                                 fileB.read (memblock, lastRead);
534                                                                 
535                                                                 //not sure why but it will read more than lastRead char...??
536                                                                 string temp = memblock;
537                                                                 outFile << temp.substr(0, lastRead);
538                                                                         
539                                                                 delete memblock;
540                                                                 
541                                                                 fileB.close();*/
542                                 m->appendFiles(fileName2, fileName);
543                                                                 m->mothurRemove(fileName2);
544                                                                 
545                                                                 //write out the merged memory
546                                                                 if (numOutputs[groupID] > 60) {
547                                     ofstream tempOut;
548                                     m->openOutputFile(fileName, tempOut);
549                                                                         tempOut << outputs[groupID];
550                                                                         outputs[groupID] = "";
551                                                                         numOutputs[groupID] = 0;
552                                     tempOut.close();
553                                                                 }
554                                                                 
555                                                                 //outFile.close();
556                                                                 
557                                                                 wroteOutPut[groupID] = true;
558                                                                 wroteOutPut[groupIDA] = false;
559                                                         }else { } //just merge memory
560                                                 }                                       
561                                         }
562                                 }
563                         }
564                         m->gobble(dFile);
565                 }
566                 dFile.close();
567         
568                 vector<string> tempDistFiles;
569                 for (int i = 0; i < numGroups; i++) {
570             string fileName = distFile + "." + toString(i) + ".temp";
571             tempDistFiles.push_back(fileName);
572             //remove old names files just in case
573                         
574                         if (numOutputs[i] > 0) {
575                 ofstream outFile;
576                                 outFile.open(fileName.c_str(), ios::app);
577                                 outFile << outputs[i];
578                                 outFile.close();
579                         }
580                 }
581         
582         map<string, int> seqGroup;
583         for (int i = 0; i < groups.size(); i++) {
584             for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
585                 seqGroup[*itNames] = i;
586                 groups[i].erase(itNames++);
587             }
588         }
589         
590                 splitNames(seqGroup, numGroups, tempDistFiles);
591                                 
592                 return 0;                       
593         }
594         catch(exception& e) {
595                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
596                 exit(1);
597         }
598 }
599 //********************************************************************************************************************
600 int SplitMatrix::splitNames(map<string, int>& seqGroup, int numGroups, vector<string>& tempDistFiles){
601         try {
602         ofstream outFile;
603         map<string, int>::iterator it;
604         
605         string inputFile = namefile;
606         if (countfile != "") { inputFile = countfile; }
607         
608         for(int i=0;i<numGroups;i++){  m->mothurRemove((inputFile + "." + toString(i) + ".temp")); }
609
610         singleton = inputFile + ".extra.temp";
611         ofstream remainingNames;
612         m->openOutputFile(singleton, remainingNames);
613         
614         bool wroteExtra = false;
615         
616         ifstream bigNameFile;
617         m->openInputFile(inputFile, bigNameFile);
618         
619         //grab header line 
620         string headers = "";
621         if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); }
622         
623         string name, nameList;
624         while(!bigNameFile.eof()){
625             bigNameFile >> name >> nameList;  
626             m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given.
627             
628             //did this sequence get assigned a group
629             it = seqGroup.find(name);
630             
631             if (it != seqGroup.end()) {  
632                 m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile);
633                 outFile << name << '\t' << nameList << endl;
634                 outFile.close();
635             }else{
636                 wroteExtra = true;
637                 remainingNames << name << '\t' << nameList << endl;
638             }
639         }
640         bigNameFile.close();
641         
642                 for(int i=0;i<numGroups;i++){
643                         string tempNameFile = inputFile + "." + toString(i) + ".temp";
644                         string tempDistFile = tempDistFiles[i];
645             
646             //if there are valid distances
647             ifstream fileHandle;
648             fileHandle.open(tempDistFile.c_str());
649             if(fileHandle)      {       
650                 m->gobble(fileHandle);
651                 if (!fileHandle.eof()) {  //check
652                                 map<string, string> temp;
653                 if (countfile != "") {
654                     //add header
655                     ofstream out;
656                     string newtempNameFile = tempNameFile + "2";
657                     m->openOutputFile(newtempNameFile, out);
658                     out << "Representative_Sequence\ttotal" << endl;
659                     out.close();
660                     m->appendFiles(tempNameFile, newtempNameFile);
661                     m->mothurRemove(tempNameFile);
662                     m->renameFile(newtempNameFile, tempNameFile);
663                 }
664                                 temp[tempDistFile] = tempNameFile;
665                                 dists.push_back(temp);
666                         }else{
667                                 ifstream in;
668                                 m->openInputFile(tempNameFile, in);
669                                 
670                                 while(!in.eof()) { 
671                                         in >> name >> nameList;  m->gobble(in);
672                                         wroteExtra = true;
673                                         remainingNames << name << '\t' << nameList << endl;
674                                 }
675                                 in.close();
676                                 m->mothurRemove(tempNameFile);
677                         }
678             }
679             fileHandle.close();
680                 }
681                 
682                 remainingNames.close();
683                 
684                 if (!wroteExtra) { 
685                         m->mothurRemove(singleton);
686                         singleton = "none";
687                 }else if (countfile != "") {
688             //add header
689             ofstream out;
690             string newtempNameFile = singleton + "2";
691             m->openOutputFile(newtempNameFile, out);
692             out << "Representative_Sequence\ttotal" << endl; 
693             out.close();
694             m->appendFiles(singleton, newtempNameFile);
695             m->mothurRemove(singleton);
696             m->renameFile(newtempNameFile, singleton);
697         }
698                 
699                 return 0;
700         }
701         catch(exception& e) {
702                 m->errorOut(e, "SplitMatrix", "splitNames");
703                 exit(1);
704         }
705 }
706 //********************************************************************************************************************
707 int SplitMatrix::splitDistanceRAM(){
708         try {
709                 vector<set<string> > groups;
710                 vector<string> outputs;
711                 
712                 int numGroups = 0;
713
714                 ifstream dFile;
715                 m->openInputFile(distFile, dFile);
716
717                 while(dFile){
718                         string seqA, seqB;
719                         float dist;
720
721                         dFile >> seqA >> seqB >> dist;
722                         
723                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
724                                         
725                         if(dist < cutoff){
726                                 //cout << "in cutoff: " << dist << endl;
727                                 int groupIDA = -1;
728                                 int groupIDB = -1;
729                                 int groupID = -1;
730                                 
731                                 for(int i=0;i<numGroups;i++){
732                                         set<string>::iterator aIt = groups[i].find(seqA);
733                                         set<string>::iterator bIt = groups[i].find(seqB);
734                                         
735                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
736                                                 groups[i].insert(seqB);
737                                                 groupIDA = i;
738                                                 groupID = groupIDA;
739
740                                                 //cout << "in aIt: " << groupID << endl;
741         //                                      break;
742                                         }
743                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
744                                                 groups[i].insert(seqA);
745                                                 groupIDB = i;
746                                                 groupID = groupIDB;
747
748                                         //      cout << "in bIt: " << groupID << endl;
749         //                                      break;
750                                         }
751                                 
752                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
753                                                 if(groupIDA < groupIDB){
754                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
755                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
756                                                         groups[groupIDB].clear(); 
757                                                         groupID = groupIDA;
758                                                 }
759                                                 else{
760                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
761                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
762                                                         groups[groupIDA].clear();  
763                                                         groupID = groupIDB;
764                                                 }
765                                                 break;
766                                         }
767                                 }
768                                 
769         //windows is gonna gag on the reuse of outFile, will need to make it local...
770                                 
771                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
772                                         set<string> newGroup;
773                                         newGroup.insert(seqA);
774                                         newGroup.insert(seqB);
775                                         groups.push_back(newGroup);
776                                                                         
777                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
778                                         outputs.push_back(tempOut);
779                                         numGroups++;
780                                 }
781                                 else{
782                                                                                         
783                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
784                                         
785                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
786                                                 string row, column, distance;
787                                                 if(groupIDA<groupIDB){
788                                                         //merge memory
789                                                         outputs[groupID] += outputs[groupIDB];
790                                                         outputs[groupIDB] = "";
791                                                 }else{
792                                                         outputs[groupID] += outputs[groupIDA];
793                                                         outputs[groupIDA] = "";
794                                                 }                                       
795                                         }
796                                 }
797                         }
798                         m->gobble(dFile);
799                 }
800                 dFile.close();
801                 
802         vector<string> tempDistFiles;
803                 for (int i = 0; i < numGroups; i++) {
804             string fileName = distFile + "." + toString(i) + ".temp";
805             tempDistFiles.push_back(fileName);
806                         if (outputs[i] != "") {
807                                 ofstream outFile;
808                                 outFile.open(fileName.c_str(), ios::ate);
809                                 outFile << outputs[i];
810                                 outFile.close();
811                         }
812                 }
813         
814         map<string, int> seqGroup;
815         for (int i = 0; i < groups.size(); i++) {
816             for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
817                 seqGroup[*itNames] = i;
818                 groups[i].erase(itNames++);
819             }
820         }
821         
822                 splitNames(seqGroup, numGroups, tempDistFiles);
823                                 
824                 return 0;                       
825         }
826         catch(exception& e) {
827                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
828                 exit(1);
829         }
830 }
831 //********************************************************************************************************************
//returns the number of bytes in the named file, or 0 after reporting through perror if it
//cannot be opened
static long distFileByteCount(const string& filename){
	long numBytes = 0;

	FILE * pFile = fopen (filename.c_str(),"rb");
	if (pFile==NULL) {
		string error = "Error opening " + filename;
		perror (error.c_str());
	}else{
		fseek (pFile, 0, SEEK_END);
		numBytes=ftell (pFile);
		fclose (pFile);
	}

	return numBytes;
}
//sorts biggest to smallest
//each map is expected to hold a single distance file -> name file entry; the size of the file
//named by the first key is what gets compared. An empty map counts as size 0.
//returns true when left's file is strictly bigger than right's.
inline bool compareFileSizes(map<string, string> left, map<string, string> right){

	long leftsize = 0;
	if (!left.empty()) { leftsize = distFileByteCount(left.begin()->first); }

	long rightsize = 0;
	if (!right.empty()) { rightsize = distFileByteCount(right.begin()->first); }

	return (leftsize > rightsize);
}
865 /***********************************************************************/
866 //returns map of distance files -> namefile sorted by distance file size
867 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
868         try {   
869                 
870                 sort(dists.begin(), dists.end(), compareFileSizes);
871                 
872                 return dists;
873         }
874         catch(exception& e) {
875                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
876                 exit(1);
877         }
878 }
879 /***********************************************************************/
880 SplitMatrix::~SplitMatrix(){}
881 /***********************************************************************/
882