]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
sped up splitting of distance file by 2.5 times by buffering the read and writes.
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12
13 /***********************************************************************/
14
15 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
16         m = MothurOut::getInstance();
17         distFile = distfile;
18         cutoff = c;
19         namefile = name;
20         method = t;
21         taxFile = tax;
22 }
23
24 /***********************************************************************/
25
26 int SplitMatrix::split(){
27         try {
28         
29                 if (method == "distance") {  
30                         splitDistance();
31                 }else if (method == "classify") {
32                         splitClassify();
33                 }else {
34                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
35                         map<string, string> temp;
36                         temp[distFile] = namefile;
37                         dists.push_back(temp);
38                 }
39                 
40                 return 0;
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "SplitMatrix", "split");
44                 exit(1);
45         }
46 }
47 /***********************************************************************/
48 int SplitMatrix::splitDistance(){
49         try {
50         
51                 vector<set<string> > groups;
52                 
53                 //for buffering the io to improve speed
54                  //allow for 10 dists to be stored, then output.
55                 vector<string> outputs;
56                 vector<int> numOutputs;
57                 vector<bool> wroteOutPut;
58                 
59                 int numGroups = 0;
60
61                 ofstream outFile;
62                 ifstream dFile;
63                 openInputFile(distFile, dFile);
64         
65                 while(dFile){
66                         string seqA, seqB;
67                         float dist;
68
69                         dFile >> seqA >> seqB >> dist;
70                         
71                         if (m->control_pressed) {  outFile.close(); dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
72                                         
73                         if(dist < cutoff){
74                                 //cout << "in cutoff: " << dist << endl;
75                                 int groupIDA = -1;
76                                 int groupIDB = -1;
77                                 int groupID = -1;
78                                 int prevGroupID = -1;
79                                 
80                                 for(int i=0;i<numGroups;i++){
81                                         set<string>::iterator aIt = groups[i].find(seqA);
82                                         set<string>::iterator bIt = groups[i].find(seqB);
83                                         
84                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
85                                                 groups[i].insert(seqB);
86                                                 groupIDA = i;
87                                                 groupID = groupIDA;
88
89                                                 //cout << "in aIt: " << groupID << endl;
90         //                                      break;
91                                         }
92                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
93                                                 groups[i].insert(seqA);
94                                                 groupIDB = i;
95                                                 groupID = groupIDB;
96
97                                         //      cout << "in bIt: " << groupID << endl;
98         //                                      break;
99                                         }
100                                 
101                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
102                                                 if(groupIDA < groupIDB){
103                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
104                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
105                                                         groups[groupIDB].clear(); 
106                                                         groupID = groupIDA;
107                                                 }
108                                                 else{
109                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
110                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
111                                                         groups[groupIDA].clear();  
112                                                         groupID = groupIDB;
113                                                 }
114                                                 break;
115                                         }
116                                 }
117                                 
118         //windows is gonna gag on the reuse of outFile, will need to make it local...
119                                 
120                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
121                                         set<string> newGroup;
122                                         newGroup.insert(seqA);
123                                         newGroup.insert(seqB);
124                                         groups.push_back(newGroup);
125                                                                         
126                                         outFile.close();
127                                         string fileName = distFile + "." + toString(numGroups) + ".temp";
128                                         outFile.open(fileName.c_str(), ios::ate);
129
130                                         string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
131                                         outputs.push_back(tempOut);
132                                         numOutputs.push_back(1);
133                                         wroteOutPut.push_back(false);
134                                         
135                                         numGroups++;
136                                 }
137                                 else{
138                                         string fileName = distFile + "." + toString(groupID) + ".temp";
139                                         
140                                         if(groupID != prevGroupID){
141                                                 outFile.close();
142                                                 outFile.open(fileName.c_str(), ios::app);
143                                                 prevGroupID     = groupID;
144                                         }
145                                         
146                                         //have we reached the max buffer size
147                                         if (numOutputs[groupID] > 10) { //write out sequence
148                                                 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
149                                                 outputs[groupID] = "";
150                                                 numOutputs[groupID] = 0;
151                                                 wroteOutPut[groupID] = true;
152                                         }else {
153                                                 outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
154                                                 numOutputs[groupID]++;
155                                         }
156                                         
157                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
158                                                 string row, column, distance;
159                                                 if(groupIDA<groupIDB){
160                         
161                                                         numOutputs[groupID] += numOutputs[groupIDB];
162                                                         outputs[groupID] += outputs[groupIDB];
163                                                         
164                                                         if (wroteOutPut[groupIDB]) {
165                                                                 string fileName = distFile + "." + toString(groupIDB) + ".temp";
166                                                                 ifstream fileB(fileName.c_str(), ios::ate);
167                                                                 
168                                                                 long size;
169                                                                 char* memblock;
170
171                                                                 size = fileB.tellg();
172                                 
173                                                                 fileB.seekg (0, ios::beg);
174                                                                 
175                                                                 int numRead = size / 1024;
176                                                                 int lastRead = size % 1024;
177
178                                                                 for (int i = 0; i < numRead; i++) {
179                                 
180                                                                         memblock = new char [1024];
181                                                                 
182                                                                         fileB.read (memblock, 1024);
183                                                                         
184                                                                         string temp = memblock;
185                                                                         outFile << temp.substr(0, 1024);
186                                                                         
187                                                                         delete memblock;
188                                                                 }
189                                                                 
190                                                                 memblock = new char [lastRead];
191                                                                 
192                                                                 fileB.read (memblock, lastRead);
193                                                                 
194                                                                 //not sure why but it will read more than lastRead char...??
195                                                                 string temp = memblock;
196                                                                 outFile << temp.substr(0, lastRead);
197                                                                 delete memblock;
198                                                                 
199                                                                 fileB.close();
200                                                                 remove(fileName.c_str());
201                                                                 
202                                                                 wroteOutPut[groupID] = true;
203                                                                 wroteOutPut[groupIDB] = false;
204                                                         }
205                                                         
206                                                         if (numOutputs[groupID] != 0) {
207                                                                 outFile << outputs[groupID];
208                                                                 wroteOutPut[groupID] = true;
209                                                                 outputs[groupID] = "";
210                                                                 numOutputs[groupID] = 0;
211                                                                 
212                                                                 outputs[groupIDB] = "";
213                                                                 numOutputs[groupIDB] = 0;
214                                                         }
215                                                         
216                                                 }
217                                                 else{
218                                                         numOutputs[groupID] += numOutputs[groupIDA];
219                                                         outputs[groupID] += outputs[groupIDA];
220                                                         
221                                                         if (wroteOutPut[groupIDA]) {
222                                                                 string fileName = distFile + "." + toString(groupIDA) + ".temp";
223                                                                 ifstream fileB(fileName.c_str(), ios::ate);
224                                                                 
225                                                                 long size;
226                                                                 char* memblock;
227
228                                                                 size = fileB.tellg();
229                                                                                                                         
230                                                                 fileB.seekg (0, ios::beg);
231                                                                 
232                                                                 int numRead = size / 1024;
233                                                                 int lastRead = size % 1024;
234
235                                                                 for (int i = 0; i < numRead; i++) {
236                                 
237                                                                         memblock = new char [1024];
238                                                                 
239                                                                         fileB.read (memblock, 1024);
240                                                                         string temp = memblock;
241                                                                         outFile << temp.substr(0, 1024);
242                                                                         
243                                                                         delete memblock;
244                                                                 }
245                                                                 
246                                                                 memblock = new char [lastRead];
247                                                                 
248                                                                 fileB.read (memblock, lastRead);
249                                                                 
250                                                                 //not sure why but it will read more than lastRead char...??
251                                                                 string temp = memblock;
252                                                                 outFile << temp.substr(0, lastRead);
253                                                                         
254                                                                 delete memblock;
255                                                                 
256                                                                 fileB.close();
257                                                                 remove(fileName.c_str());
258                                                                 
259                                                                 wroteOutPut[groupID] = true;
260                                                                 wroteOutPut[groupIDA] = false;
261                                                         }
262                                                         
263                                                         if (numOutputs[groupID] != 0) {
264                                                                 outFile << outputs[groupID];
265                                                                 wroteOutPut[groupID] = true;
266                                                                 outputs[groupID] = "";
267                                                                 numOutputs[groupID] = 0;
268                                                                 
269                                                                 outputs[groupIDA] = "";
270                                                                 numOutputs[groupIDA] = 0;
271                                                         }
272
273                                                 }                                       
274                                         }
275                                 }
276                         }
277                         gobble(dFile);
278                 }
279                 outFile.close();
280                 dFile.close();
281                 
282                 for (int i = 0; i < numGroups; i++) {
283                         if (numOutputs[i] > 0) {
284                                 string fileName = distFile + "." + toString(i) + ".temp";
285                                 outFile.open(fileName.c_str(), ios::app);
286                                 outFile << outputs[i];
287                                 outFile.close();
288                         }
289                 }
290         
291                 ifstream bigNameFile(namefile.c_str());
292                 if(!bigNameFile){
293                         cerr << "Error: We can't open the name file\n";
294                         exit(1);
295                 }
296                 
297                 map<string, string> nameMap;
298                 string name, nameList;
299                 while(bigNameFile){
300                         bigNameFile >> name >> nameList;
301                         nameMap[name] = nameList;
302                         gobble(bigNameFile);
303                 }
304                 bigNameFile.close();
305                         
306                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
307                         int numSeqsInGroup = groups[i].size();
308                         
309                         if(numSeqsInGroup > 0){
310                                 string fileName = namefile + "." + toString(i) + ".temp";
311                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
312                                 
313                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
314                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
315                                         if (nIt != nameMap.end()) {
316                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
317                                                 nameMap.erase(nIt);
318                                         }else{
319                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
320                                         }
321                                 }
322                                 smallNameFile.close();
323                         }
324                 }
325                 
326                 //names of singletons
327                 if (nameMap.size() != 0) {
328                         singleton = namefile + ".extra.temp";
329                         ofstream remainingNames(singleton.c_str(), ios::ate);
330                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
331                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
332                         }
333                         remainingNames.close();
334                 }else { singleton = "none"; }
335                         
336                 for(int i=0;i<numGroups;i++){
337                         if(groups[i].size() > 0){
338                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
339                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
340                                 
341                                 map<string, string> temp;
342                                 temp[tempDistFile] = tempNameFile;
343                                 dists.push_back(temp);
344                         }
345                 }
346                 
347                 if (m->control_pressed)  {  
348                         for (int i = 0; i < dists.size(); i++) { 
349                                 remove((dists[i].begin()->first).c_str());
350                                 remove((dists[i].begin()->second).c_str());
351                         }
352                         dists.clear();
353                 }
354                 
355                 return 0;
356                         
357         }
358         catch(exception& e) {
359                 m->errorOut(e, "SplitMatrix", "splitDistance");
360                 exit(1);
361         }
362 }
363
364 /***********************************************************************/
365 int SplitMatrix::splitClassify(){
366         try {
367                 cutoff = int(cutoff);
368                 
369                 map<string, int> seqGroup;
370                 map<string, int>::iterator it;
371                 map<string, int>::iterator it2;
372                 
373                 int numGroups = 0;
374                 
375                 //build tree from users taxonomy file
376                 PhyloTree* phylo = new PhyloTree();
377                 
378                 ifstream in;
379                 openInputFile(taxFile, in);
380                         
381                 //read in users taxonomy file and add sequences to tree
382                 string seqname, tax;
383                 while(!in.eof()){
384                         in >> seqname >> tax; gobble(in);
385                                 
386                         phylo->addSeqToTree(seqname, tax);
387                 }
388                 in.close();
389                 
390                 phylo->assignHeirarchyIDs(0);
391
392                 //make sure the cutoff is not greater than maxlevel
393                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
394                 
395                 //for each node in tree
396                 for (int i = 0; i < phylo->getNumNodes(); i++) {
397                 
398                         //is this node within the cutoff
399                         TaxNode taxon = phylo->get(i);
400                 
401                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
402                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
403                                         for (int j = 0; j < taxon.accessions.size(); j++) {
404                                                 seqGroup[taxon.accessions[j]] = numGroups;
405                                         }
406                                         numGroups++;
407                                 }
408                         }
409                 }
410
411                 ifstream dFile;
412                 openInputFile(distFile, dFile);
413                 ofstream outFile;
414                 
415                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
416                         remove((distFile + "." + toString(i) + ".temp").c_str());
417                 }
418                 
419                 
420                 //for buffering the io to improve speed
421                  //allow for 10 dists to be stored, then output.
422                 vector<string> outputs;  outputs.resize(numGroups, "");
423                 vector<int> numOutputs;  numOutputs.resize(numGroups, 0);       
424                 
425                 //for each distance
426                 while(dFile){
427                         string seqA, seqB;
428                         float dist;
429                         
430                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
431                         
432                         dFile >> seqA >> seqB >> dist;  gobble(dFile);
433                         
434                         //if both sequences are in the same group then they are within the cutoff
435                         it = seqGroup.find(seqA);
436                         it2 = seqGroup.find(seqB);
437                         
438                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
439                                 if (it->second == it2->second) { //they are from the same group so add the distance
440                                         if (numOutputs[it->second] > 10) {
441                                                 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
442                                                 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
443                                                 outFile.close();
444                                                 outputs[it->second] = "";
445                                                 numOutputs[it->second] = 0;
446                                         }else{
447                                                 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
448                                                 numOutputs[it->second]++;
449                                         }
450                                 }
451                         }
452                 }
453                 dFile.close();
454         
455                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
456                         remove((namefile + "." + toString(i) + ".temp").c_str());
457                         
458                         //write out any remaining buffers
459                         if (numOutputs[it->second] > 0) {
460                                 openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
461                                 outFile << outputs[i];
462                                 outFile.close();
463                                 outputs[i] = "";
464                                 numOutputs[i] = 0;
465                         }
466                 }
467                 
468                 ifstream bigNameFile;
469                 openInputFile(namefile, bigNameFile);
470                 
471                 singleton = namefile + ".extra.temp";
472                 ofstream remainingNames;
473                 openOutputFile(singleton, remainingNames);
474                 
475                 bool wroteExtra = false;
476                                                 
477                 string name, nameList;
478                 while(!bigNameFile.eof()){
479                         bigNameFile >> name >> nameList;  gobble(bigNameFile);
480                         
481                         //did this sequence get assigned a group
482                         it = seqGroup.find(name);
483                         
484                         if (it != seqGroup.end()) {  
485                                 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
486                                 outFile << name << '\t' << nameList << endl;
487                                 outFile.close();
488                         }else{
489                                 wroteExtra = true;
490                                 remainingNames << name << '\t' << nameList << endl;
491                         }
492                 }
493                 bigNameFile.close();
494                 remainingNames.close();
495                 
496                 if (!wroteExtra) { 
497                         remove(singleton.c_str());
498                         singleton = "none";
499                 }
500                         
501                 for(int i=0;i<numGroups;i++){
502                         string tempNameFile = namefile + "." + toString(i) + ".temp";
503                         string tempDistFile = distFile + "." + toString(i) + ".temp";
504                                 
505                         map<string, string> temp;
506                         temp[tempDistFile] = tempNameFile;
507                         dists.push_back(temp);
508                 }
509                 
510                 if (m->control_pressed)  {  
511                         for (int i = 0; i < dists.size(); i++) { 
512                                 remove((dists[i].begin()->first).c_str());
513                                 remove((dists[i].begin()->second).c_str());
514                         }
515                         dists.clear();
516                 }
517                 
518                 return 0;
519                         
520         }
521         catch(exception& e) {
522                 m->errorOut(e, "SplitMatrix", "splitClassify");
523                 exit(1);
524         }
525 }
526 //********************************************************************************************************************
527 //sorts biggest to smallest
528 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
529         
530         FILE * pFile;
531         long leftsize = 0;
532                 
533         //get num bytes in file
534         string filename = left.begin()->first;
535         pFile = fopen (filename.c_str(),"rb");
536         string error = "Error opening " + filename;
537         if (pFile==NULL) perror (error.c_str());
538         else{
539                 fseek (pFile, 0, SEEK_END);
540                 leftsize=ftell (pFile);
541                 fclose (pFile);
542         }
543
544         FILE * pFile2;
545         long rightsize = 0;
546                 
547         //get num bytes in file
548         filename = right.begin()->first;
549         pFile2 = fopen (filename.c_str(),"rb");
550         error = "Error opening " + filename;
551         if (pFile2==NULL) perror (error.c_str());
552         else{
553                 fseek (pFile2, 0, SEEK_END);
554                 rightsize=ftell (pFile2);
555                 fclose (pFile2);
556         }
557
558         return (leftsize > rightsize);  
559
560 /***********************************************************************/
561 //returns map of distance files -> namefile sorted by distance file size
562 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
563         try {   
564                 
565                 sort(dists.begin(), dists.end(), compareFileSizes);
566                 
567                 return dists;
568         }
569         catch(exception& e) {
570                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
571                 exit(1);
572         }
573 }
574 /***********************************************************************/
575 SplitMatrix::~SplitMatrix(){}
576 /***********************************************************************/
577