]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24     countfile = count;
25         large = l;
26 }
27 /***********************************************************************/
28
29 SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){
30         m = MothurOut::getInstance();
31         fastafile = ffile;
32         namefile = name;
33     countfile = count;
34         taxFile = tax;
35         cutoff = c;  //tax level cutoff
36         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
37         method = t;
38         processors = p;
39     classic = cl;
40         outputDir = output;
41 }
42
43 /***********************************************************************/
44
45 int SplitMatrix::split(){
46         try {
47         
48                 if (method == "distance") {  
49                         splitDistance();
50                 }else if ((method == "classify") || (method == "fasta")) {
51                         splitClassify();
52                 }else {
53                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
54                         map<string, string> temp;
55                         if (namefile != "") {  temp[distFile] = namefile; }
56             else { temp[distFile] = countfile; }
57                         dists.push_back(temp);
58                 }
59                 
60                 return 0;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "SplitMatrix", "split");
64                 exit(1);
65         }
66 }
67 /***********************************************************************/
68 int SplitMatrix::splitDistance(){
69         try {
70         
71                 if (large)      { splitDistanceLarge(); }
72                 else            { splitDistanceRAM();   }
73                 
74                 return 0;
75                         
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "SplitMatrix", "splitDistance");
79                 exit(1);
80         }
81 }
82
83 /***********************************************************************/
84 int SplitMatrix::splitClassify(){
85         try {
86                 cutoff = int(cutoff);
87                                 
88                 map<string, int> seqGroup;
89                 map<string, int>::iterator it;
90                 map<string, int>::iterator it2;
91                 
92                 int numGroups = 0;
93                 
94                 //build tree from users taxonomy file
95                 PhyloTree* phylo = new PhyloTree();
96                 
97         map<string, string> temp;
98         m->readTax(taxFile, temp);
99         
100         for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
101             phylo->addSeqToTree(itTemp->first, itTemp->second);
102             temp.erase(itTemp++);
103         }
104                 
105                 phylo->assignHeirarchyIDs(0);
106
107                 //make sure the cutoff is not greater than maxlevel
108                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
109         
110                 //for each node in tree
111                 for (int i = 0; i < phylo->getNumNodes(); i++) {
112                 
113                         //is this node within the cutoff
114                         TaxNode taxon = phylo->get(i);
115         
116                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
117                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
118                                         for (int j = 0; j < taxon.accessions.size(); j++) {
119                                                 seqGroup[taxon.accessions[j]] = numGroups;
120                                         }
121                                         numGroups++;
122                                 }
123                         }
124                 }
125         
126                 delete phylo;
127                 
128                 if (method == "classify") {
129                         splitDistanceFileByTax(seqGroup, numGroups);
130                 }else {
131                         createDistanceFilesFromTax(seqGroup, numGroups);
132                 }
133                 
134                 return 0;
135                         
136         }
137         catch(exception& e) {
138                 m->errorOut(e, "SplitMatrix", "splitClassify");
139                 exit(1);
140         }
141 }
142 /***********************************************************************/
143 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
144         try {
145                 map<string, int> copyGroups = seqGroup;
146                 map<string, int>::iterator it;
147                 set<string> names;
148                                 
149                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
150                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
151                 }
152                         
153                 ifstream in;
154                 m->openInputFile(fastafile, in);
155         
156                 //parse fastafile
157                 ofstream outFile;
158                 while (!in.eof()) {
159                         Sequence query(in); m->gobble(in);
160                         if (query.getName() != "") {
161                 
162                                 it = seqGroup.find(query.getName());
163                                 
164                                 //save names in case no namefile is given
165                                 if ((namefile == "") && (countfile == "")) {  names.insert(query.getName()); }
166                         
167                                 if (it != seqGroup.end()) { //not singleton 
168                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
169                                         query.printSequence(outFile); 
170                                         outFile.close();
171                                         
172                                         copyGroups.erase(query.getName());
173                                 }
174                         }
175                 }
176                 in.close();
177                 
178                 //warn about sequence in groups that are not in fasta file
179                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
180                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
181                         exit(1);
182                 }
183                 
184                 copyGroups.clear();
185                 
186                 //process each distance file
187                 for (int i = 0; i < numGroups; i++) { 
188                         
189                         string options = "";
190             if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
191             else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
192                         if (outputDir != "") { options += ", outputdir=" + outputDir; }
193                         
194             m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
195             
196                         Command* command = new DistanceCommand(options);
197                         
198             m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
199             
200                         command->execute();
201                         delete command;
202                         
203                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
204                         
205                         //remove old names files just in case
206                         if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); }
207             else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); }
208                 }
209         
210         vector<string> tempDistFiles;    
211         for(int i=0;i<numGroups;i++){
212             if (outputDir == "") { outputDir = m->hasPath(fastafile); }
213             string tempDistFile = "";
214             if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
215             else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
216             tempDistFiles.push_back(tempDistFile);
217         }
218         
219         splitNames(seqGroup, numGroups, tempDistFiles);
220         
221                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
222                 
223                 return 0;
224         }
225         catch(exception& e) {
226                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
227                 exit(1);
228         }
229 }
230 /***********************************************************************/
231 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
232         try {
233                 map<string, int>::iterator it;
234                 map<string, int>::iterator it2;
235                 
236         ofstream outFile;
237                 ifstream dFile;
238                 m->openInputFile(distFile, dFile);
239                 
240                 
241                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
242                         m->mothurRemove((distFile + "." + toString(i) + ".temp"));
243                 }
244                 
245                 //for buffering the io to improve speed
246                  //allow for 10 dists to be stored, then output.
247                 vector<string> outputs;  outputs.resize(numGroups, "");
248                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
249                 
250                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
251                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
252                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
253                 
254                 //for each distance
255                 while(dFile){
256                         string seqA, seqB;
257                         float dist;
258                         
259                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp"));       } }
260                         
261                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
262                         
263                         //if both sequences are in the same group then they are within the cutoff
264                         it = seqGroup.find(seqA);
265                         it2 = seqGroup.find(seqB);
266                         
267                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
268                                 if (it->second == it2->second) { //they are from the same group so add the distance
269                                         if (numOutputs[it->second] > 30) {
270                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
271                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
272                                                 outFile.close();
273                                                 outputs[it->second] = "";
274                                                 numOutputs[it->second] = 0;
275                                                 validDistances[it->second] = true;
276                                         }else{
277                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
278                                                 numOutputs[it->second]++;
279                                         }
280                                 }
281                         }
282                 }
283                 dFile.close();
284         
285         string inputFile = namefile;
286         if (countfile != "") { inputFile = countfile; }
287         
288         vector<string> tempDistFiles;
289                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
290             string tempDistFile = distFile + "." + toString(i) + ".temp";
291             tempDistFiles.push_back(tempDistFile);
292                         m->mothurRemove((inputFile + "." + toString(i) + ".temp"));
293                         
294                         //write out any remaining buffers
295                         if (numOutputs[i] > 0) {
296                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
297                                 outFile << outputs[i];
298                                 outFile.close();
299                                 outputs[i] = "";
300                                 numOutputs[i] = 0;
301                                 validDistances[i] = true;
302                         }
303                 }
304                 
305         splitNames(seqGroup, numGroups, tempDistFiles);
306         
307                 if (m->control_pressed)  {  
308                         for (int i = 0; i < dists.size(); i++) { 
309                                 m->mothurRemove((dists[i].begin()->first));
310                                 m->mothurRemove((dists[i].begin()->second));
311                         }
312                         dists.clear();
313                 }
314                 
315                 return 0;
316         }
317         catch(exception& e) {
318                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
319                 exit(1);
320         }
321 }
322 /***********************************************************************/
323 int SplitMatrix::splitDistanceLarge(){
324         try {
325                 vector<set<string> > groups;
326                 
327                 //for buffering the io to improve speed
328                  //allow for 30 dists to be stored, then output.
329                 vector<string> outputs;
330                 vector<int> numOutputs;
331                 vector<bool> wroteOutPut;
332                 
333                 int numGroups = 0;
334
335                 ofstream outFile;
336                 ifstream dFile;
337                 m->openInputFile(distFile, dFile);
338         
339                 while(dFile){
340                         string seqA, seqB;
341                         float dist;
342
343                         dFile >> seqA >> seqB >> dist;
344                         
345                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
346                                         
347                         if(dist < cutoff){
348                                 //cout << "in cutoff: " << dist << endl;
349                                 int groupIDA = -1;
350                                 int groupIDB = -1;
351                                 int groupID = -1;
352                                 
353                                 for(int i=0;i<numGroups;i++){
354                                         set<string>::iterator aIt = groups[i].find(seqA);
355                                         set<string>::iterator bIt = groups[i].find(seqB);
356                                         
357                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
358                                                 groups[i].insert(seqB);
359                                                 groupIDA = i;
360                                                 groupID = groupIDA;
361
362                                                 //cout << "in aIt: " << groupID << endl;
363         //                                      break;
364                                         }
365                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
366                                                 groups[i].insert(seqA);
367                                                 groupIDB = i;
368                                                 groupID = groupIDB;
369
370                                         //      cout << "in bIt: " << groupID << endl;
371         //                                      break;
372                                         }
373                                 
374                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
375                                                 if(groupIDA < groupIDB){
376                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
377                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
378                                                         groups[groupIDB].clear(); 
379                                                         groupID = groupIDA;
380                                                 }
381                                                 else{
382                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
383                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
384                                                         groups[groupIDA].clear();  
385                                                         groupID = groupIDB;
386                                                 }
387                                                 break;
388                                         }
389                                 }
390                                 
391         //windows is gonna gag on the reuse of outFile, will need to make it local...
392                                 
393                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
394                                         set<string> newGroup;
395                                         newGroup.insert(seqA);
396                                         newGroup.insert(seqB);
397                                         groups.push_back(newGroup);
398                                                                         
399                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
400                                         outputs.push_back(tempOut);
401                                         numOutputs.push_back(1);
402                                         wroteOutPut.push_back(false);
403                                         
404                                         numGroups++;
405                                 }
406                                 else{
407                                         string fileName = distFile + "." + toString(groupID) + ".temp";
408                                                                                         
409                                         //have we reached the max buffer size
410                                         if (numOutputs[groupID] > 60) { //write out sequence
411                                                 outFile.open(fileName.c_str(), ios::app);
412                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
413                                                 outFile.close();
414                                                 
415                                                 outputs[groupID] = "";
416                                                 numOutputs[groupID] = 0;
417                                                 wroteOutPut[groupID] = true;
418                                         }else {
419                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
420                                                 numOutputs[groupID]++;
421                                         }
422                                         
423                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
424                                                 string row, column, distance;
425                                                 if(groupIDA<groupIDB){
426                                                         
427                                                         //merge memory
428                                                         numOutputs[groupID] += numOutputs[groupIDB];
429                                                         outputs[groupID] += outputs[groupIDB];
430                                                         
431                                                         outputs[groupIDB] = "";
432                                                         numOutputs[groupIDB] = 0;
433                                                         
434                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
435                                                         if (wroteOutPut[groupIDB]) {
436                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
437                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
438                                                                 
439                                                                 outFile.open(fileName.c_str(), ios::app);
440                                                                 
441                                                                 long size;
442                                                                 char* memblock;
443
444                                                                 size = fileB.tellg();
445                                 
446                                                                 fileB.seekg (0, ios::beg);
447                                                                 
448                                                                 int numRead = size / 1024;
449                                                                 int lastRead = size % 1024;
450
451                                                                 for (int i = 0; i < numRead; i++) {
452                                 
453                                                                         memblock = new char [1024];
454                                                                 
455                                                                         fileB.read (memblock, 1024);
456                                                                         
457                                                                         string temp = memblock;
458                                                                         outFile << temp.substr(0, 1024);
459                                                                         
460                                                                         delete memblock;
461                                                                 }
462                                                                 
463                                                                 memblock = new char [lastRead];
464                                                                 
465                                                                 fileB.read (memblock, lastRead);
466                                                                 
467                                                                 //not sure why but it will read more than lastRead char...??
468                                                                 string temp = memblock;
469                                                                 outFile << temp.substr(0, lastRead);
470                                                                 delete memblock;
471                                                                 
472                                                                 fileB.close();
473                                                                 m->mothurRemove(fileName2);
474                                                                 
475                                                                 //write out the merged memory
476                                                                 if (numOutputs[groupID] > 60) {
477                                                                         outFile << outputs[groupID];
478                                                                         outputs[groupID] = "";
479                                                                         numOutputs[groupID] = 0;
480                                                                 }
481                                                                 
482                                                                 outFile.close();
483                                                                 
484                                                                 wroteOutPut[groupID] = true;
485                                                                 wroteOutPut[groupIDB] = false;
486                                                         }else{ } //just merge b's memory with a's memory 
487                                                 }
488                                                 else{
489                                                         numOutputs[groupID] += numOutputs[groupIDA];
490                                                         outputs[groupID] += outputs[groupIDA];
491                                                         
492                                                         outputs[groupIDA] = "";
493                                                         numOutputs[groupIDA] = 0;
494                                                         
495                                                         if (wroteOutPut[groupIDA]) {
496                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
497                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
498                                                                 
499                                                                 outFile.open(fileName.c_str(), ios::app);
500                                                                 
501                                                                 long size;
502                                                                 char* memblock;
503
504                                                                 size = fileB.tellg();
505                                                                                                                         
506                                                                 fileB.seekg (0, ios::beg);
507                                                                 
508                                                                 int numRead = size / 1024;
509                                                                 int lastRead = size % 1024;
510
511                                                                 for (int i = 0; i < numRead; i++) {
512                                 
513                                                                         memblock = new char [1024];
514                                                                 
515                                                                         fileB.read (memblock, 1024);
516                                                                         string temp = memblock;
517                                                                         outFile << temp.substr(0, 1024);
518                                                                         
519                                                                         delete memblock;
520                                                                 }
521                                                                 
522                                                                 memblock = new char [lastRead];
523                                                                 
524                                                                 fileB.read (memblock, lastRead);
525                                                                 
526                                                                 //not sure why but it will read more than lastRead char...??
527                                                                 string temp = memblock;
528                                                                 outFile << temp.substr(0, lastRead);
529                                                                         
530                                                                 delete memblock;
531                                                                 
532                                                                 fileB.close();
533                                                                 m->mothurRemove(fileName2);
534                                                                 
535                                                                 //write out the merged memory
536                                                                 if (numOutputs[groupID] > 60) {
537                                                                         outFile << outputs[groupID];
538                                                                         outputs[groupID] = "";
539                                                                         numOutputs[groupID] = 0;
540                                                                 }
541                                                                 
542                                                                 outFile.close();
543                                                                 
544                                                                 wroteOutPut[groupID] = true;
545                                                                 wroteOutPut[groupIDA] = false;
546                                                         }else { } //just merge memory
547                                                 }                                       
548                                         }
549                                 }
550                         }
551                         m->gobble(dFile);
552                 }
553                 dFile.close();
554         
555                 vector<string> tempDistFiles;
556                 for (int i = 0; i < numGroups; i++) {
557             string fileName = distFile + "." + toString(i) + ".temp";
558             tempDistFiles.push_back(fileName);
559             //remove old names files just in case
560                         
561                         if (numOutputs[i] > 0) {
562                                 outFile.open(fileName.c_str(), ios::app);
563                                 outFile << outputs[i];
564                                 outFile.close();
565                         }
566                 }
567         
568         map<string, int> seqGroup;
569         for (int i = 0; i < groups.size(); i++) {
570             for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
571                 seqGroup[*itNames] = i;
572                 groups[i].erase(itNames++);
573             }
574         }
575         
576                 splitNames(seqGroup, numGroups, tempDistFiles);
577                                 
578                 return 0;                       
579         }
580         catch(exception& e) {
581                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
582                 exit(1);
583         }
584 }
585 //********************************************************************************************************************
586 int SplitMatrix::splitNames(map<string, int>& seqGroup, int numGroups, vector<string>& tempDistFiles){
587         try {
588         ofstream outFile;
589         map<string, int>::iterator it;
590         
591         string inputFile = namefile;
592         if (countfile != "") { inputFile = countfile; }
593         
594         for(int i=0;i<numGroups;i++){  m->mothurRemove((inputFile + "." + toString(i) + ".temp")); }
595
596         singleton = inputFile + ".extra.temp";
597         ofstream remainingNames;
598         m->openOutputFile(singleton, remainingNames);
599         
600         bool wroteExtra = false;
601         
602         ifstream bigNameFile;
603         m->openInputFile(inputFile, bigNameFile);
604         
605         //grab header line 
606         string headers = "";
607         if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); }
608         
609         string name, nameList;
610         while(!bigNameFile.eof()){
611             bigNameFile >> name >> nameList;  
612             m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given.
613             
614             //did this sequence get assigned a group
615             it = seqGroup.find(name);
616             
617             if (it != seqGroup.end()) {  
618                 m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile);
619                 outFile << name << '\t' << nameList << endl;
620                 outFile.close();
621             }else{
622                 wroteExtra = true;
623                 remainingNames << name << '\t' << nameList << endl;
624             }
625         }
626         bigNameFile.close();
627         
628                 for(int i=0;i<numGroups;i++){
629                         string tempNameFile = inputFile + "." + toString(i) + ".temp";
630                         string tempDistFile = tempDistFiles[i];
631             
632             //if there are valid distances
633             ifstream fileHandle;
634             fileHandle.open(tempDistFile.c_str());
635             if(fileHandle)      {       
636                 m->gobble(fileHandle);
637                 if (!fileHandle.eof()) {  //check
638                                 map<string, string> temp;
639                 if (countfile != "") {
640                     //add header
641                     ofstream out;
642                     string newtempNameFile = tempNameFile + "2";
643                     m->openOutputFile(newtempNameFile, out);
644                     out << "Representative_Sequence\ttotal" << endl;
645                     out.close();
646                     m->appendFiles(tempNameFile, newtempNameFile);
647                     m->mothurRemove(tempNameFile);
648                     m->renameFile(newtempNameFile, tempNameFile);
649                 }
650                                 temp[tempDistFile] = tempNameFile;
651                                 dists.push_back(temp);
652                         }else{
653                                 ifstream in;
654                                 m->openInputFile(tempNameFile, in);
655                                 
656                                 while(!in.eof()) { 
657                                         in >> name >> nameList;  m->gobble(in);
658                                         wroteExtra = true;
659                                         remainingNames << name << '\t' << nameList << endl;
660                                 }
661                                 in.close();
662                                 m->mothurRemove(tempNameFile);
663                         }
664             }
665             fileHandle.close();
666                 }
667                 
668                 remainingNames.close();
669                 
670                 if (!wroteExtra) { 
671                         m->mothurRemove(singleton);
672                         singleton = "none";
673                 }else if (countfile != "") {
674             //add header
675             ofstream out;
676             string newtempNameFile = singleton + "2";
677             m->openOutputFile(newtempNameFile, out);
678             out << "Representative_Sequence\ttotal" << endl; 
679             out.close();
680             m->appendFiles(singleton, newtempNameFile);
681             m->mothurRemove(singleton);
682             m->renameFile(newtempNameFile, singleton);
683         }
684                 
685                 return 0;
686         }
687         catch(exception& e) {
688                 m->errorOut(e, "SplitMatrix", "splitNames");
689                 exit(1);
690         }
691 }
692 //********************************************************************************************************************
693 int SplitMatrix::splitDistanceRAM(){
694         try {
695                 vector<set<string> > groups;
696                 vector<string> outputs;
697                 
698                 int numGroups = 0;
699
700                 ifstream dFile;
701                 m->openInputFile(distFile, dFile);
702
703                 while(dFile){
704                         string seqA, seqB;
705                         float dist;
706
707                         dFile >> seqA >> seqB >> dist;
708                         
709                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
710                                         
711                         if(dist < cutoff){
712                                 //cout << "in cutoff: " << dist << endl;
713                                 int groupIDA = -1;
714                                 int groupIDB = -1;
715                                 int groupID = -1;
716                                 
717                                 for(int i=0;i<numGroups;i++){
718                                         set<string>::iterator aIt = groups[i].find(seqA);
719                                         set<string>::iterator bIt = groups[i].find(seqB);
720                                         
721                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
722                                                 groups[i].insert(seqB);
723                                                 groupIDA = i;
724                                                 groupID = groupIDA;
725
726                                                 //cout << "in aIt: " << groupID << endl;
727         //                                      break;
728                                         }
729                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
730                                                 groups[i].insert(seqA);
731                                                 groupIDB = i;
732                                                 groupID = groupIDB;
733
734                                         //      cout << "in bIt: " << groupID << endl;
735         //                                      break;
736                                         }
737                                 
738                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
739                                                 if(groupIDA < groupIDB){
740                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
741                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
742                                                         groups[groupIDB].clear(); 
743                                                         groupID = groupIDA;
744                                                 }
745                                                 else{
746                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
747                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
748                                                         groups[groupIDA].clear();  
749                                                         groupID = groupIDB;
750                                                 }
751                                                 break;
752                                         }
753                                 }
754                                 
755         //windows is gonna gag on the reuse of outFile, will need to make it local...
756                                 
757                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
758                                         set<string> newGroup;
759                                         newGroup.insert(seqA);
760                                         newGroup.insert(seqB);
761                                         groups.push_back(newGroup);
762                                                                         
763                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
764                                         outputs.push_back(tempOut);
765                                         numGroups++;
766                                 }
767                                 else{
768                                                                                         
769                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
770                                         
771                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
772                                                 string row, column, distance;
773                                                 if(groupIDA<groupIDB){
774                                                         //merge memory
775                                                         outputs[groupID] += outputs[groupIDB];
776                                                         outputs[groupIDB] = "";
777                                                 }else{
778                                                         outputs[groupID] += outputs[groupIDA];
779                                                         outputs[groupIDA] = "";
780                                                 }                                       
781                                         }
782                                 }
783                         }
784                         m->gobble(dFile);
785                 }
786                 dFile.close();
787                 
788         vector<string> tempDistFiles;
789                 for (int i = 0; i < numGroups; i++) {
790             string fileName = distFile + "." + toString(i) + ".temp";
791             tempDistFiles.push_back(fileName);
792                         if (outputs[i] != "") {
793                                 ofstream outFile;
794                                 outFile.open(fileName.c_str(), ios::ate);
795                                 outFile << outputs[i];
796                                 outFile.close();
797                         }
798                 }
799         
800         map<string, int> seqGroup;
801         for (int i = 0; i < groups.size(); i++) {
802             for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
803                 seqGroup[*itNames] = i;
804                 groups[i].erase(itNames++);
805             }
806         }
807         
808                 splitNames(seqGroup, numGroups, tempDistFiles);
809                                 
810                 return 0;                       
811         }
812         catch(exception& e) {
813                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
814                 exit(1);
815         }
816 }
817 //********************************************************************************************************************
818 //sorts biggest to smallest
819 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
820         
821         FILE * pFile;
822         long leftsize = 0;
823                 
824         //get num bytes in file
825         string filename = left.begin()->first;
826         pFile = fopen (filename.c_str(),"rb");
827         string error = "Error opening " + filename;
828         if (pFile==NULL) perror (error.c_str());
829         else{
830                 fseek (pFile, 0, SEEK_END);
831                 leftsize=ftell (pFile);
832                 fclose (pFile);
833         }
834
835         FILE * pFile2;
836         long rightsize = 0;
837                 
838         //get num bytes in file
839         filename = right.begin()->first;
840         pFile2 = fopen (filename.c_str(),"rb");
841         error = "Error opening " + filename;
842         if (pFile2==NULL) perror (error.c_str());
843         else{
844                 fseek (pFile2, 0, SEEK_END);
845                 rightsize=ftell (pFile2);
846                 fclose (pFile2);
847         }
848
849         return (leftsize > rightsize);  
850
851 /***********************************************************************/
852 //returns map of distance files -> namefile sorted by distance file size
853 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
854         try {   
855                 
856                 sort(dists.begin(), dists.end(), compareFileSizes);
857                 
858                 return dists;
859         }
860         catch(exception& e) {
861                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
862                 exit(1);
863         }
864 }
865 /***********************************************************************/
866 SplitMatrix::~SplitMatrix(){}
867 /***********************************************************************/
868