]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
added cluster.split command
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11
12 /***********************************************************************/
13
14 SplitMatrix::SplitMatrix(string distfile, string name, float c){
15         m = MothurOut::getInstance();
16         distFile = distfile;
17         cutoff = c;
18         namefile = name;
19 }
20
21 /***********************************************************************/
22
23 int SplitMatrix::split(){
24         try {
25         
26                 vector<set<string> > groups;
27                 int numGroups = 0;
28
29                 ofstream outFile;
30                 ifstream dFile;
31                 openInputFile(distFile, dFile);
32         
33                 while(dFile){
34                         string seqA, seqB;
35                         float dist;
36
37                         dFile >> seqA >> seqB >> dist;
38                         
39                         if(dist < cutoff){
40                                 //cout << "in cutoff: " << dist << endl;
41                                 int groupIDA = -1;
42                                 int groupIDB = -1;
43                                 int groupID = -1;
44                                 int prevGroupID = -1;
45                                 
46                                 for(int i=0;i<numGroups;i++){
47                                         set<string>::iterator aIt = groups[i].find(seqA);
48                                         set<string>::iterator bIt = groups[i].find(seqB);
49                                         
50                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
51                                                 groups[i].insert(seqB);
52                                                 groupIDA = i;
53                                                 groupID = groupIDA;
54
55                                                 //cout << "in aIt: " << groupID << endl;
56         //                                      break;
57                                         }
58                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
59                                                 groups[i].insert(seqA);
60                                                 groupIDB = i;
61                                                 groupID = groupIDB;
62
63                                         //      cout << "in bIt: " << groupID << endl;
64         //                                      break;
65                                         }
66                                 
67                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
68                                                 if(groupIDA < groupIDB){
69                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
70                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
71                                                         groups[groupIDB].clear(); 
72                                                         groupID = groupIDA;
73                                                 }
74                                                 else{
75                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
76                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
77                                                         groups[groupIDA].clear();  
78                                                         groupID = groupIDB;
79                                                 }
80                                                 break;
81                                         }
82                                 }
83                                 
84         //windows is gonna gag on the reuse of outFile, will need to make it local...
85                                 
86                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
87                                         set<string> newGroup;
88                                         newGroup.insert(seqA);
89                                         newGroup.insert(seqB);
90                                         groups.push_back(newGroup);
91                                                                         
92                                         outFile.close();
93                                         string fileName = distFile + "." + toString(numGroups) + ".temp";
94                                         outFile.open(fileName.c_str(), ios::ate);
95
96                                         outFile << seqA << '\t' << seqB << '\t' << dist << endl;
97                                         numGroups++;
98                                 }
99                                 else{
100                                         string fileName = distFile + "." + toString(groupID) + ".temp";
101                                         if(groupID != prevGroupID){
102                                                 outFile.close();
103                                                 outFile.open(fileName.c_str(), ios::app);
104                                                 prevGroupID     = groupID;
105                                         }
106                                         outFile << seqA << '\t' << seqB << '\t' << dist << endl;
107                                         
108                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
109                                                 string row, column, distance;
110                                                 if(groupIDA<groupIDB){
111                                                         string fileName = distFile + "." + toString(groupIDB) + ".temp";
112                                                         ifstream fileB(fileName.c_str());
113                                                         while(fileB){
114                                                                 fileB >> row >> column >> distance;
115                                                                 outFile << row << '\t' << column << '\t' << distance << endl;
116                                                                 gobble(fileB);
117                                                         }
118                                                         fileB.close();
119                                                         remove(fileName.c_str());
120                                                 }
121                                                 else{
122                                                         string fileName = distFile + "." + toString(groupIDA) + ".temp";
123                                                         ifstream fileA(fileName.c_str());
124                                                         while(fileA){
125                                                                 fileA >> row >> column >> distance;
126                                                                 outFile << row << '\t' << column << '\t' << distance << endl;
127                                                                 gobble(fileA);
128                                                         }
129                                                         fileA.close();
130                                                         remove(fileName.c_str());
131                                                 }                                       
132                                         }
133                                 }
134                         }
135                         gobble(dFile);
136                 }
137                 outFile.close();
138                 dFile.close();
139         
140                 ifstream bigNameFile(namefile.c_str());
141                 if(!bigNameFile){
142                         cerr << "Error: We can't open the name file\n";
143                         exit(1);
144                 }
145                 
146                 map<string, string> nameMap;
147                 string name, nameList;
148                 while(bigNameFile){
149                         bigNameFile >> name >> nameList;
150                         nameMap[name] = nameList;
151                         gobble(bigNameFile);
152                 }
153                 bigNameFile.close();
154                         
155                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
156                         int numSeqsInGroup = groups[i].size();
157                         
158                         if(numSeqsInGroup > 0){
159                                 string fileName = namefile + "." + toString(i) + ".temp";
160                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
161                                 
162                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
163                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
164                                         
165                                         if (nIt != nameMap.end()) {
166                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
167                                                 nameMap.erase(nIt);
168                                         }else{
169                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
170                                         }
171                                 }
172                                 smallNameFile.close();
173                         }
174                 }
175                         
176                 //names of singletons
177                 if (nameMap.size() != 0) {
178                         singleton = namefile + ".extra.temp";
179                         ofstream remainingNames(singleton.c_str(), ios::ate);
180                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
181                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
182                         }
183                         remainingNames.close();
184                 }else { singleton = "none"; }
185                         
186                 for(int i=0;i<numGroups;i++){
187                         if(groups[i].size() > 0){
188                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
189                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
190                                 
191                                 map<string, string> temp;
192                                 temp[tempDistFile] = tempNameFile;
193                                 dists.push_back(temp);
194                         }
195                 }
196                                 
197                 return 0;
198                         
199         }
200         catch(exception& e) {
201                 m->errorOut(e, "SplitMatrix", "split");
202                 exit(1);
203         }
204 }
205 //********************************************************************************************************************
206 //sorts biggest to smallest
207 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
208         
209         FILE * pFile;
210         long leftsize = 0;
211                 
212         //get num bytes in file
213         string filename = left.begin()->first;
214         pFile = fopen (filename.c_str(),"rb");
215         string error = "Error opening " + filename;
216         if (pFile==NULL) perror (error.c_str());
217         else{
218                 fseek (pFile, 0, SEEK_END);
219                 leftsize=ftell (pFile);
220                 fclose (pFile);
221         }
222
223         FILE * pFile2;
224         long rightsize = 0;
225                 
226         //get num bytes in file
227         filename = right.begin()->first;
228         pFile2 = fopen (filename.c_str(),"rb");
229         error = "Error opening " + filename;
230         if (pFile2==NULL) perror (error.c_str());
231         else{
232                 fseek (pFile2, 0, SEEK_END);
233                 rightsize=ftell (pFile2);
234                 fclose (pFile2);
235         }
236
237         return (leftsize > rightsize);  
238
239 /***********************************************************************/
240 //returns map of distance files -> namefile sorted by distance file size
241 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
242         try {   
243                 
244                 sort(dists.begin(), dists.end(), compareFileSizes);
245                 
246                 return dists;
247         }
248         catch(exception& e) {
249                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
250                 exit(1);
251         }
252 }
253 /***********************************************************************/
254 SplitMatrix::~SplitMatrix(){}
255 /***********************************************************************/
256