]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12 #include "distancecommand.h"
13 #include "seqsummarycommand.h"
14
15 /***********************************************************************/
16
17 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
18         m = MothurOut::getInstance();
19         distFile = distfile;
20         cutoff = c;
21         namefile = name;
22         method = t;
23         taxFile = tax;
24         large = l;
25 }
26 /***********************************************************************/
27
28 SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, bool cl, string output){
29         m = MothurOut::getInstance();
30         fastafile = ffile;
31         namefile = name;
32         taxFile = tax;
33         cutoff = c;  //tax level cutoff
34         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
35         method = t;
36         processors = p;
37     classic = cl;
38         outputDir = output;
39 }
40
41 /***********************************************************************/
42
43 int SplitMatrix::split(){
44         try {
45         
46                 if (method == "distance") {  
47                         splitDistance();
48                 }else if ((method == "classify") || (method == "fasta")) {
49                         splitClassify();
50                 }else {
51                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
52                         map<string, string> temp;
53                         temp[distFile] = namefile;
54                         dists.push_back(temp);
55                 }
56                 
57                 return 0;
58         }
59         catch(exception& e) {
60                 m->errorOut(e, "SplitMatrix", "split");
61                 exit(1);
62         }
63 }
64 /***********************************************************************/
65 int SplitMatrix::splitDistance(){
66         try {
67         
68                 if (large)      { splitDistanceLarge(); }
69                 else            { splitDistanceRAM();   }
70                 
71                 return 0;
72                         
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "SplitMatrix", "splitDistance");
76                 exit(1);
77         }
78 }
79
80 /***********************************************************************/
81 int SplitMatrix::splitClassify(){
82         try {
83                 cutoff = int(cutoff);
84                                 
85                 map<string, int> seqGroup;
86                 map<string, int>::iterator it;
87                 map<string, int>::iterator it2;
88                 
89                 int numGroups = 0;
90                 
91                 //build tree from users taxonomy file
92                 PhyloTree* phylo = new PhyloTree();
93                 
94         map<string, string> temp;
95         m->readTax(taxFile, temp);
96         
97         for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
98             phylo->addSeqToTree(itTemp->first, itTemp->second);
99             temp.erase(itTemp++);
100         }
101                 
102                 phylo->assignHeirarchyIDs(0);
103
104                 //make sure the cutoff is not greater than maxlevel
105                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
106         
107                 //for each node in tree
108                 for (int i = 0; i < phylo->getNumNodes(); i++) {
109                 
110                         //is this node within the cutoff
111                         TaxNode taxon = phylo->get(i);
112         
113                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
114                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
115                                         for (int j = 0; j < taxon.accessions.size(); j++) {
116                                                 seqGroup[taxon.accessions[j]] = numGroups;
117                                         }
118                                         numGroups++;
119                                 }
120                         }
121                 }
122         
123                 delete phylo;
124                 
125                 if (method == "classify") {
126                         splitDistanceFileByTax(seqGroup, numGroups);
127                 }else {
128                         createDistanceFilesFromTax(seqGroup, numGroups);
129                 }
130                 
131                 return 0;
132                         
133         }
134         catch(exception& e) {
135                 m->errorOut(e, "SplitMatrix", "splitClassify");
136                 exit(1);
137         }
138 }
139 /***********************************************************************/
140 int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
141         try {
142                 map<string, int> copyGroups = seqGroup;
143                 map<string, int>::iterator it;
144                 set<string> names;
145                                 
146                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
147                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
148                 }
149                         
150                 ifstream in;
151                 m->openInputFile(fastafile, in);
152         
153                 //parse fastafile
154                 ofstream outFile;
155                 while (!in.eof()) {
156                         Sequence query(in); m->gobble(in);
157                         if (query.getName() != "") {
158                 
159                                 it = seqGroup.find(query.getName());
160                                 
161                                 //save names in case no namefile is given
162                                 if (namefile == "") {  names.insert(query.getName()); }
163                         
164                                 if (it != seqGroup.end()) { //not singleton 
165                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
166                                         query.printSequence(outFile); 
167                                         outFile.close();
168                                         
169                                         copyGroups.erase(query.getName());
170                                 }
171                         }
172                 }
173                 in.close();
174                 
175                 //warn about sequence in groups that are not in fasta file
176                 for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
177                         m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
178                         exit(1);
179                 }
180                 
181                 copyGroups.clear();
182                 
183                 //process each distance file
184                 for (int i = 0; i < numGroups; i++) { 
185                         
186                         string options = "";
187             if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
188             else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
189                         if (outputDir != "") { options += ", outputdir=" + outputDir; }
190                         
191                         Command* command = new DistanceCommand(options);
192                         
193                         command->execute();
194                         delete command;
195                         
196                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
197                         
198                         //remove old names files just in case
199                         m->mothurRemove((namefile + "." + toString(i) + ".temp"));
200                 }
201                         
202                 singleton = namefile + ".extra.temp";
203                 ofstream remainingNames;
204                 m->openOutputFile(singleton, remainingNames);
205                 
206                 bool wroteExtra = false;
207
208                 ifstream bigNameFile;
209                 m->openInputFile(namefile, bigNameFile);
210                 
211                 string name, nameList;
212                 while(!bigNameFile.eof()){
213                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
214                         
215                         //did this sequence get assigned a group
216                         it = seqGroup.find(name);
217                         
218                         if (it != seqGroup.end()) {  
219                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
220                                 outFile << name << '\t' << nameList << endl;
221                                 outFile.close();
222                         }else{
223                                 wroteExtra = true;
224                                 remainingNames << name << '\t' << nameList << endl;
225                         }
226                 }
227                 bigNameFile.close();
228                 
229                 for(int i=0;i<numGroups;i++){
230                         string tempNameFile = namefile + "." + toString(i) + ".temp";
231                         if (outputDir == "") { outputDir = m->hasPath(fastafile); }
232                         string tempDistFile = "";
233             if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
234             else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
235
236                         //if there are valid distances
237                         ifstream fileHandle;
238                         fileHandle.open(tempDistFile.c_str());
239                         if(fileHandle)  {       
240                                 m->gobble(fileHandle);
241                                 if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
242                                         map<string, string> temp;
243                                         temp[tempDistFile] = tempNameFile;
244                                         dists.push_back(temp);
245                                 }else {
246                                         ifstream in;
247                                         m->openInputFile(tempNameFile, in);
248                                 
249                                         while(!in.eof()) { 
250                                                 in >> name >> nameList;  m->gobble(in);
251                                                 wroteExtra = true;
252                                                 remainingNames << name << '\t' << nameList << endl;
253                                         }
254                                         in.close();
255                                         m->mothurRemove(tempNameFile);
256                                 }
257                         }
258                         fileHandle.close();
259                 }
260                 
261                 remainingNames.close();
262                 if (!wroteExtra) { 
263                         m->mothurRemove(singleton);
264                         singleton = "none";
265                 }
266
267                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
268                 
269                 return 0;
270         }
271         catch(exception& e) {
272                 m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
273                 exit(1);
274         }
275 }
276 /***********************************************************************/
277 int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){
278         try {
279                 map<string, int>::iterator it;
280                 map<string, int>::iterator it2;
281                 
282                 ifstream dFile;
283                 m->openInputFile(distFile, dFile);
284                 ofstream outFile;
285                 
286                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
287                         m->mothurRemove((distFile + "." + toString(i) + ".temp"));
288                 }
289                 
290                 //for buffering the io to improve speed
291                  //allow for 10 dists to be stored, then output.
292                 vector<string> outputs;  outputs.resize(numGroups, "");
293                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
294                 
295                 //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
296                 //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
297                 vector<bool> validDistances;   validDistances.resize(numGroups, false); 
298                 
299                 //for each distance
300                 while(dFile){
301                         string seqA, seqB;
302                         float dist;
303                         
304                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp"));       } }
305                         
306                         dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
307                         
308                         //if both sequences are in the same group then they are within the cutoff
309                         it = seqGroup.find(seqA);
310                         it2 = seqGroup.find(seqB);
311                         
312                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
313                                 if (it->second == it2->second) { //they are from the same group so add the distance
314                                         if (numOutputs[it->second] > 30) {
315                                                 m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
316                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
317                                                 outFile.close();
318                                                 outputs[it->second] = "";
319                                                 numOutputs[it->second] = 0;
320                                                 validDistances[it->second] = true;
321                                         }else{
322                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
323                                                 numOutputs[it->second]++;
324                                         }
325                                 }
326                         }
327                 }
328                 dFile.close();
329         
330                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
331                         m->mothurRemove((namefile + "." + toString(i) + ".temp"));
332                         
333                         //write out any remaining buffers
334                         if (numOutputs[i] > 0) {
335                                 m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
336                                 outFile << outputs[i];
337                                 outFile.close();
338                                 outputs[i] = "";
339                                 numOutputs[i] = 0;
340                                 validDistances[i] = true;
341                         }
342                 }
343                 
344                 ifstream bigNameFile;
345                 m->openInputFile(namefile, bigNameFile);
346                 
347                 singleton = namefile + ".extra.temp";
348                 ofstream remainingNames;
349                 m->openOutputFile(singleton, remainingNames);
350                 
351                 bool wroteExtra = false;
352                                                 
353                 string name, nameList;
354                 while(!bigNameFile.eof()){
355                         bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
356                         
357                         //did this sequence get assigned a group
358                         it = seqGroup.find(name);
359                         
360                         if (it != seqGroup.end()) {  
361                                 m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
362                                 outFile << name << '\t' << nameList << endl;
363                                 outFile.close();
364                         }else{
365                                 wroteExtra = true;
366                                 remainingNames << name << '\t' << nameList << endl;
367                         }
368                 }
369                 bigNameFile.close();
370                                 
371                 for(int i=0;i<numGroups;i++){
372                         string tempNameFile = namefile + "." + toString(i) + ".temp";
373                         string tempDistFile = distFile + "." + toString(i) + ".temp";
374
375                         //if there are valid distances
376                         if (validDistances[i]) {
377                                 map<string, string> temp;
378                                 temp[tempDistFile] = tempNameFile;
379                                 dists.push_back(temp);
380                         }else{
381                                 ifstream in;
382                                 m->openInputFile(tempNameFile, in);
383                                 
384                                 while(!in.eof()) { 
385                                         in >> name >> nameList;  m->gobble(in);
386                                         wroteExtra = true;
387                                         remainingNames << name << '\t' << nameList << endl;
388                                 }
389                                 in.close();
390                                 m->mothurRemove(tempNameFile);
391                         }
392                 }
393                 
394                 remainingNames.close();
395                 
396                 if (!wroteExtra) { 
397                         m->mothurRemove(singleton);
398                         singleton = "none";
399                 }
400
401                 if (m->control_pressed)  {  
402                         for (int i = 0; i < dists.size(); i++) { 
403                                 m->mothurRemove((dists[i].begin()->first));
404                                 m->mothurRemove((dists[i].begin()->second));
405                         }
406                         dists.clear();
407                 }
408                 
409                 return 0;
410         }
411         catch(exception& e) {
412                 m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
413                 exit(1);
414         }
415 }
416 /***********************************************************************/
417 int SplitMatrix::splitDistanceLarge(){
418         try {
419                 vector<set<string> > groups;
420                 
421                 //for buffering the io to improve speed
422                  //allow for 30 dists to be stored, then output.
423                 vector<string> outputs;
424                 vector<int> numOutputs;
425                 vector<bool> wroteOutPut;
426                 
427                 int numGroups = 0;
428
429                 ofstream outFile;
430                 ifstream dFile;
431                 m->openInputFile(distFile, dFile);
432         
433                 while(dFile){
434                         string seqA, seqB;
435                         float dist;
436
437                         dFile >> seqA >> seqB >> dist;
438                         
439                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
440                                         
441                         if(dist < cutoff){
442                                 //cout << "in cutoff: " << dist << endl;
443                                 int groupIDA = -1;
444                                 int groupIDB = -1;
445                                 int groupID = -1;
446                                 
447                                 for(int i=0;i<numGroups;i++){
448                                         set<string>::iterator aIt = groups[i].find(seqA);
449                                         set<string>::iterator bIt = groups[i].find(seqB);
450                                         
451                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
452                                                 groups[i].insert(seqB);
453                                                 groupIDA = i;
454                                                 groupID = groupIDA;
455
456                                                 //cout << "in aIt: " << groupID << endl;
457         //                                      break;
458                                         }
459                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
460                                                 groups[i].insert(seqA);
461                                                 groupIDB = i;
462                                                 groupID = groupIDB;
463
464                                         //      cout << "in bIt: " << groupID << endl;
465         //                                      break;
466                                         }
467                                 
468                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
469                                                 if(groupIDA < groupIDB){
470                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
471                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
472                                                         groups[groupIDB].clear(); 
473                                                         groupID = groupIDA;
474                                                 }
475                                                 else{
476                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
477                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
478                                                         groups[groupIDA].clear();  
479                                                         groupID = groupIDB;
480                                                 }
481                                                 break;
482                                         }
483                                 }
484                                 
485         //windows is gonna gag on the reuse of outFile, will need to make it local...
486                                 
487                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
488                                         set<string> newGroup;
489                                         newGroup.insert(seqA);
490                                         newGroup.insert(seqB);
491                                         groups.push_back(newGroup);
492                                                                         
493                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
494                                         outputs.push_back(tempOut);
495                                         numOutputs.push_back(1);
496                                         wroteOutPut.push_back(false);
497                                         
498                                         numGroups++;
499                                 }
500                                 else{
501                                         string fileName = distFile + "." + toString(groupID) + ".temp";
502                                                                                         
503                                         //have we reached the max buffer size
504                                         if (numOutputs[groupID] > 60) { //write out sequence
505                                                 outFile.open(fileName.c_str(), ios::app);
506                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
507                                                 outFile.close();
508                                                 
509                                                 outputs[groupID] = "";
510                                                 numOutputs[groupID] = 0;
511                                                 wroteOutPut[groupID] = true;
512                                         }else {
513                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
514                                                 numOutputs[groupID]++;
515                                         }
516                                         
517                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
518                                                 string row, column, distance;
519                                                 if(groupIDA<groupIDB){
520                                                         
521                                                         //merge memory
522                                                         numOutputs[groupID] += numOutputs[groupIDB];
523                                                         outputs[groupID] += outputs[groupIDB];
524                                                         
525                                                         outputs[groupIDB] = "";
526                                                         numOutputs[groupIDB] = 0;
527                                                         
528                                                         //if groupB is written to file it is above buffer size so read and write to new merged file
529                                                         if (wroteOutPut[groupIDB]) {
530                                                                 string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
531                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
532                                                                 
533                                                                 outFile.open(fileName.c_str(), ios::app);
534                                                                 
535                                                                 long size;
536                                                                 char* memblock;
537
538                                                                 size = fileB.tellg();
539                                 
540                                                                 fileB.seekg (0, ios::beg);
541                                                                 
542                                                                 int numRead = size / 1024;
543                                                                 int lastRead = size % 1024;
544
545                                                                 for (int i = 0; i < numRead; i++) {
546                                 
547                                                                         memblock = new char [1024];
548                                                                 
549                                                                         fileB.read (memblock, 1024);
550                                                                         
551                                                                         string temp = memblock;
552                                                                         outFile << temp.substr(0, 1024);
553                                                                         
554                                                                         delete memblock;
555                                                                 }
556                                                                 
557                                                                 memblock = new char [lastRead];
558                                                                 
559                                                                 fileB.read (memblock, lastRead);
560                                                                 
561                                                                 //not sure why but it will read more than lastRead char...??
562                                                                 string temp = memblock;
563                                                                 outFile << temp.substr(0, lastRead);
564                                                                 delete memblock;
565                                                                 
566                                                                 fileB.close();
567                                                                 m->mothurRemove(fileName2);
568                                                                 
569                                                                 //write out the merged memory
570                                                                 if (numOutputs[groupID] > 60) {
571                                                                         outFile << outputs[groupID];
572                                                                         outputs[groupID] = "";
573                                                                         numOutputs[groupID] = 0;
574                                                                 }
575                                                                 
576                                                                 outFile.close();
577                                                                 
578                                                                 wroteOutPut[groupID] = true;
579                                                                 wroteOutPut[groupIDB] = false;
580                                                         }else{ } //just merge b's memory with a's memory 
581                                                 }
582                                                 else{
583                                                         numOutputs[groupID] += numOutputs[groupIDA];
584                                                         outputs[groupID] += outputs[groupIDA];
585                                                         
586                                                         outputs[groupIDA] = "";
587                                                         numOutputs[groupIDA] = 0;
588                                                         
589                                                         if (wroteOutPut[groupIDA]) {
590                                                                 string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
591                                                                 ifstream fileB(fileName2.c_str(), ios::ate);
592                                                                 
593                                                                 outFile.open(fileName.c_str(), ios::app);
594                                                                 
595                                                                 long size;
596                                                                 char* memblock;
597
598                                                                 size = fileB.tellg();
599                                                                                                                         
600                                                                 fileB.seekg (0, ios::beg);
601                                                                 
602                                                                 int numRead = size / 1024;
603                                                                 int lastRead = size % 1024;
604
605                                                                 for (int i = 0; i < numRead; i++) {
606                                 
607                                                                         memblock = new char [1024];
608                                                                 
609                                                                         fileB.read (memblock, 1024);
610                                                                         string temp = memblock;
611                                                                         outFile << temp.substr(0, 1024);
612                                                                         
613                                                                         delete memblock;
614                                                                 }
615                                                                 
616                                                                 memblock = new char [lastRead];
617                                                                 
618                                                                 fileB.read (memblock, lastRead);
619                                                                 
620                                                                 //not sure why but it will read more than lastRead char...??
621                                                                 string temp = memblock;
622                                                                 outFile << temp.substr(0, lastRead);
623                                                                         
624                                                                 delete memblock;
625                                                                 
626                                                                 fileB.close();
627                                                                 m->mothurRemove(fileName2);
628                                                                 
629                                                                 //write out the merged memory
630                                                                 if (numOutputs[groupID] > 60) {
631                                                                         outFile << outputs[groupID];
632                                                                         outputs[groupID] = "";
633                                                                         numOutputs[groupID] = 0;
634                                                                 }
635                                                                 
636                                                                 outFile.close();
637                                                                 
638                                                                 wroteOutPut[groupID] = true;
639                                                                 wroteOutPut[groupIDA] = false;
640                                                         }else { } //just merge memory
641                                                 }                                       
642                                         }
643                                 }
644                         }
645                         m->gobble(dFile);
646                 }
647                 dFile.close();
648                 
649                 for (int i = 0; i < numGroups; i++) {
650                         if (numOutputs[i] > 0) {
651                                 string fileName = distFile + "." + toString(i) + ".temp";
652                                 outFile.open(fileName.c_str(), ios::app);
653                                 outFile << outputs[i];
654                                 outFile.close();
655                         }
656                 }
657
658                 splitNames(groups);
659                                 
660                 return 0;                       
661         }
662         catch(exception& e) {
663                 m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
664                 exit(1);
665         }
666 }
667 //********************************************************************************************************************
668 int SplitMatrix::splitNames(vector<set<string> >& groups){
669         try {
670                 int numGroups = groups.size();
671         
672                 ifstream bigNameFile(namefile.c_str());
673                 if(!bigNameFile){
674                         cerr << "Error: We can't open the name file\n";
675                         exit(1);
676                 }
677                 
678                 map<string, string> nameMap;
679                 string name, nameList;
680                 while(bigNameFile){
681                         bigNameFile >> name >> nameList;
682                         nameMap[name] = nameList;
683                         m->gobble(bigNameFile);
684                 }
685                 bigNameFile.close();
686                         
687                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
688                         int numSeqsInGroup = groups[i].size();
689                         
690                         if(numSeqsInGroup > 0){
691                                 string fileName = namefile + "." + toString(i) + ".temp";
692                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
693                                 
694                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
695                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
696                                         if (nIt != nameMap.end()) {
697                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
698                                                 nameMap.erase(nIt);
699                                         }else{
700                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
701                                         }
702                                 }
703                                 smallNameFile.close();
704                         }
705                 }
706                 
707                 //names of singletons
708                 if (nameMap.size() != 0) {
709                         singleton = namefile + ".extra.temp";
710                         ofstream remainingNames(singleton.c_str(), ios::ate);
711                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
712                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
713                         }
714                         remainingNames.close();
715                 }else { singleton = "none"; }
716                         
717                 for(int i=0;i<numGroups;i++){
718                         if(groups[i].size() > 0){
719                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
720                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
721                                 
722                                 map<string, string> temp;
723                                 temp[tempDistFile] = tempNameFile;
724                                 dists.push_back(temp);
725                         }
726                 }
727                 
728                 if (m->control_pressed)  {  
729                         for (int i = 0; i < dists.size(); i++) { 
730                                 m->mothurRemove((dists[i].begin()->first));
731                                 m->mothurRemove((dists[i].begin()->second));
732                         }
733                         dists.clear();
734                 }
735                 
736                 return 0;
737         }
738         catch(exception& e) {
739                 m->errorOut(e, "SplitMatrix", "splitNames");
740                 exit(1);
741         }
742 }
743 //********************************************************************************************************************
744 int SplitMatrix::splitDistanceRAM(){
745         try {
746                 vector<set<string> > groups;
747                 vector<string> outputs;
748                 
749                 int numGroups = 0;
750
751                 ifstream dFile;
752                 m->openInputFile(distFile, dFile);
753
754                 while(dFile){
755                         string seqA, seqB;
756                         float dist;
757
758                         dFile >> seqA >> seqB >> dist;
759                         
760                         if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
761                                         
762                         if(dist < cutoff){
763                                 //cout << "in cutoff: " << dist << endl;
764                                 int groupIDA = -1;
765                                 int groupIDB = -1;
766                                 int groupID = -1;
767                                 
768                                 for(int i=0;i<numGroups;i++){
769                                         set<string>::iterator aIt = groups[i].find(seqA);
770                                         set<string>::iterator bIt = groups[i].find(seqB);
771                                         
772                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
773                                                 groups[i].insert(seqB);
774                                                 groupIDA = i;
775                                                 groupID = groupIDA;
776
777                                                 //cout << "in aIt: " << groupID << endl;
778         //                                      break;
779                                         }
780                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
781                                                 groups[i].insert(seqA);
782                                                 groupIDB = i;
783                                                 groupID = groupIDB;
784
785                                         //      cout << "in bIt: " << groupID << endl;
786         //                                      break;
787                                         }
788                                 
789                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
790                                                 if(groupIDA < groupIDB){
791                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
792                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
793                                                         groups[groupIDB].clear(); 
794                                                         groupID = groupIDA;
795                                                 }
796                                                 else{
797                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
798                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
799                                                         groups[groupIDA].clear();  
800                                                         groupID = groupIDB;
801                                                 }
802                                                 break;
803                                         }
804                                 }
805                                 
806         //windows is gonna gag on the reuse of outFile, will need to make it local...
807                                 
808                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
809                                         set<string> newGroup;
810                                         newGroup.insert(seqA);
811                                         newGroup.insert(seqB);
812                                         groups.push_back(newGroup);
813                                                                         
814                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
815                                         outputs.push_back(tempOut);
816                                         numGroups++;
817                                 }
818                                 else{
819                                                                                         
820                                         outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
821                                         
822                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
823                                                 string row, column, distance;
824                                                 if(groupIDA<groupIDB){
825                                                         //merge memory
826                                                         outputs[groupID] += outputs[groupIDB];
827                                                         outputs[groupIDB] = "";
828                                                 }else{
829                                                         outputs[groupID] += outputs[groupIDA];
830                                                         outputs[groupIDA] = "";
831                                                 }                                       
832                                         }
833                                 }
834                         }
835                         m->gobble(dFile);
836                 }
837                 dFile.close();
838                 
839                 for (int i = 0; i < numGroups; i++) {
840                         if (outputs[i] != "") {
841                                 ofstream outFile;
842                                 string fileName = distFile + "." + toString(i) + ".temp";
843                                 outFile.open(fileName.c_str(), ios::ate);
844                                 outFile << outputs[i];
845                                 outFile.close();
846                         }
847                 }
848
849                 splitNames(groups);
850                                 
851                 return 0;                       
852         }
853         catch(exception& e) {
854                 m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
855                 exit(1);
856         }
857 }
858 //********************************************************************************************************************
859 //sorts biggest to smallest
860 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
861         
862         FILE * pFile;
863         long leftsize = 0;
864                 
865         //get num bytes in file
866         string filename = left.begin()->first;
867         pFile = fopen (filename.c_str(),"rb");
868         string error = "Error opening " + filename;
869         if (pFile==NULL) perror (error.c_str());
870         else{
871                 fseek (pFile, 0, SEEK_END);
872                 leftsize=ftell (pFile);
873                 fclose (pFile);
874         }
875
876         FILE * pFile2;
877         long rightsize = 0;
878                 
879         //get num bytes in file
880         filename = right.begin()->first;
881         pFile2 = fopen (filename.c_str(),"rb");
882         error = "Error opening " + filename;
883         if (pFile2==NULL) perror (error.c_str());
884         else{
885                 fseek (pFile2, 0, SEEK_END);
886                 rightsize=ftell (pFile2);
887                 fclose (pFile2);
888         }
889
890         return (leftsize > rightsize);  
891
892 /***********************************************************************/
893 //returns map of distance files -> namefile sorted by distance file size
894 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
895         try {   
896                 
897                 sort(dists.begin(), dists.end(), compareFileSizes);
898                 
899                 return dists;
900         }
901         catch(exception& e) {
902                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
903                 exit(1);
904         }
905 }
906 /***********************************************************************/
907 SplitMatrix::~SplitMatrix(){}
908 /***********************************************************************/
909