]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
finished cluster.split adding classify method.
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12
13 /***********************************************************************/
14
15 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
16         m = MothurOut::getInstance();
17         distFile = distfile;
18         cutoff = c;
19         namefile = name;
20         method = t;
21         taxFile = tax;
22 }
23
24 /***********************************************************************/
25
26 int SplitMatrix::split(){
27         try {
28         
29                 if (method == "distance") {  
30                         splitDistance();
31                 }else if (method == "classify") {
32                         splitClassify();
33                 }else {
34                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
35                         map<string, string> temp;
36                         temp[distFile] = namefile;
37                         dists.push_back(temp);
38                 }
39                 
40                 return 0;
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "SplitMatrix", "split");
44                 exit(1);
45         }
46 }
47 /***********************************************************************/
48 int SplitMatrix::splitDistance(){
49         try {
50         
51                 vector<set<string> > groups;
52                 int numGroups = 0;
53
54                 ofstream outFile;
55                 ifstream dFile;
56                 openInputFile(distFile, dFile);
57         
58                 while(dFile){
59                         string seqA, seqB;
60                         float dist;
61
62                         dFile >> seqA >> seqB >> dist;
63                         
64                         if (m->control_pressed) {  outFile.close(); dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
65                                         
66                         if(dist < cutoff){
67                                 //cout << "in cutoff: " << dist << endl;
68                                 int groupIDA = -1;
69                                 int groupIDB = -1;
70                                 int groupID = -1;
71                                 int prevGroupID = -1;
72                                 
73                                 for(int i=0;i<numGroups;i++){
74                                         set<string>::iterator aIt = groups[i].find(seqA);
75                                         set<string>::iterator bIt = groups[i].find(seqB);
76                                         
77                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
78                                                 groups[i].insert(seqB);
79                                                 groupIDA = i;
80                                                 groupID = groupIDA;
81
82                                                 //cout << "in aIt: " << groupID << endl;
83         //                                      break;
84                                         }
85                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
86                                                 groups[i].insert(seqA);
87                                                 groupIDB = i;
88                                                 groupID = groupIDB;
89
90                                         //      cout << "in bIt: " << groupID << endl;
91         //                                      break;
92                                         }
93                                 
94                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
95                                                 if(groupIDA < groupIDB){
96                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
97                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
98                                                         groups[groupIDB].clear(); 
99                                                         groupID = groupIDA;
100                                                 }
101                                                 else{
102                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
103                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
104                                                         groups[groupIDA].clear();  
105                                                         groupID = groupIDB;
106                                                 }
107                                                 break;
108                                         }
109                                 }
110                                 
111         //windows is gonna gag on the reuse of outFile, will need to make it local...
112                                 
113                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
114                                         set<string> newGroup;
115                                         newGroup.insert(seqA);
116                                         newGroup.insert(seqB);
117                                         groups.push_back(newGroup);
118                                                                         
119                                         outFile.close();
120                                         string fileName = distFile + "." + toString(numGroups) + ".temp";
121                                         outFile.open(fileName.c_str(), ios::ate);
122
123                                         outFile << seqA << '\t' << seqB << '\t' << dist << endl;
124                                         numGroups++;
125                                 }
126                                 else{
127                                         string fileName = distFile + "." + toString(groupID) + ".temp";
128                                         if(groupID != prevGroupID){
129                                                 outFile.close();
130                                                 outFile.open(fileName.c_str(), ios::app);
131                                                 prevGroupID     = groupID;
132                                         }
133                                         outFile << seqA << '\t' << seqB << '\t' << dist << endl;
134                                         
135                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
136                                                 string row, column, distance;
137                                                 if(groupIDA<groupIDB){
138                                                         string fileName = distFile + "." + toString(groupIDB) + ".temp";
139                                                         ifstream fileB(fileName.c_str());
140                                                         while(fileB){
141                                                                 fileB >> row >> column >> distance;
142                                                                 outFile << row << '\t' << column << '\t' << distance << endl;
143                                                                 gobble(fileB);
144                                                         }
145                                                         fileB.close();
146                                                         remove(fileName.c_str());
147                                                 }
148                                                 else{
149                                                         string fileName = distFile + "." + toString(groupIDA) + ".temp";
150                                                         ifstream fileA(fileName.c_str());
151                                                         while(fileA){
152                                                                 fileA >> row >> column >> distance;
153                                                                 outFile << row << '\t' << column << '\t' << distance << endl;
154                                                                 gobble(fileA);
155                                                         }
156                                                         fileA.close();
157                                                         remove(fileName.c_str());
158                                                 }                                       
159                                         }
160                                 }
161                         }
162                         gobble(dFile);
163                 }
164                 outFile.close();
165                 dFile.close();
166         
167                 ifstream bigNameFile(namefile.c_str());
168                 if(!bigNameFile){
169                         cerr << "Error: We can't open the name file\n";
170                         exit(1);
171                 }
172                 
173                 map<string, string> nameMap;
174                 string name, nameList;
175                 while(bigNameFile){
176                         bigNameFile >> name >> nameList;
177                         nameMap[name] = nameList;
178                         gobble(bigNameFile);
179                 }
180                 bigNameFile.close();
181                         
182                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
183                         int numSeqsInGroup = groups[i].size();
184                         
185                         if(numSeqsInGroup > 0){
186                                 string fileName = namefile + "." + toString(i) + ".temp";
187                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
188                                 
189                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
190                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
191                                         
192                                         if (nIt != nameMap.end()) {
193                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
194                                                 nameMap.erase(nIt);
195                                         }else{
196                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
197                                         }
198                                 }
199                                 smallNameFile.close();
200                         }
201                 }
202                 
203                 //names of singletons
204                 if (nameMap.size() != 0) {
205                         singleton = namefile + ".extra.temp";
206                         ofstream remainingNames(singleton.c_str(), ios::ate);
207                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
208                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
209                         }
210                         remainingNames.close();
211                 }else { singleton = "none"; }
212                         
213                 for(int i=0;i<numGroups;i++){
214                         if(groups[i].size() > 0){
215                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
216                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
217                                 
218                                 map<string, string> temp;
219                                 temp[tempDistFile] = tempNameFile;
220                                 dists.push_back(temp);
221                         }
222                 }
223                 
224                 if (m->control_pressed)  {  
225                         for (int i = 0; i < dists.size(); i++) { 
226                                 remove((dists[i].begin()->first).c_str());
227                                 remove((dists[i].begin()->second).c_str());
228                         }
229                         dists.clear();
230                 }
231                 
232                 return 0;
233                         
234         }
235         catch(exception& e) {
236                 m->errorOut(e, "SplitMatrix", "splitDistance");
237                 exit(1);
238         }
239 }
240
241 /***********************************************************************/
242 int SplitMatrix::splitClassify(){
243         try {
244                 cutoff = int(cutoff);
245                 
246                 map<string, int> seqGroup;
247                 map<string, int>::iterator it;
248                 map<string, int>::iterator it2;
249                 
250                 int numGroups = 0;
251                 
252                 //build tree from users taxonomy file
253                 PhyloTree* phylo = new PhyloTree();
254                 
255                 ifstream in;
256                 openInputFile(taxFile, in);
257                         
258                 //read in users taxonomy file and add sequences to tree
259                 string seqname, tax;
260                 while(!in.eof()){
261                         in >> seqname >> tax; gobble(in);
262                                 
263                         phylo->addSeqToTree(seqname, tax);
264                 }
265                 in.close();
266                 
267                 phylo->assignHeirarchyIDs(0);
268
269                 //make sure the cutoff is not greater than maxlevel
270                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
271                 
272                 //for each node in tree
273                 for (int i = 0; i < phylo->getNumNodes(); i++) {
274                 
275                         //is this node within the cutoff
276                         TaxNode taxon = phylo->get(i);
277                 
278                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
279                                 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
280                                         for (int j = 0; j < taxon.accessions.size(); j++) {
281                                                 seqGroup[taxon.accessions[j]] = numGroups;
282                                         }
283                                         numGroups++;
284                                 }
285                         }
286                 }
287
288                 ifstream dFile;
289                 openInputFile(distFile, dFile);
290                 ofstream outFile;
291                 
292                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
293                         remove((distFile + "." + toString(i) + ".temp").c_str());
294                 }
295                 
296                 //for each distance
297                 while(dFile){
298                         string seqA, seqB;
299                         float dist;
300                         
301                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
302                         
303                         dFile >> seqA >> seqB >> dist;  gobble(dFile);
304                         
305                         //if both sequences are in the same group then they are within the cutoff
306                         it = seqGroup.find(seqA);
307                         it2 = seqGroup.find(seqB);
308                         
309                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
310                                 if (it->second == it2->second) { //they are from the same group so add the distance
311                                         openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
312                                         outFile << seqA << '\t' << seqB << '\t' << dist << endl;
313                                         outFile.close();
314                                 }
315                         }
316                 }
317                 dFile.close();
318         
319                 
320                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
321                         remove((namefile + "." + toString(i) + ".temp").c_str());
322                 }
323                 
324                 ifstream bigNameFile;
325                 openInputFile(namefile, bigNameFile);
326                 
327                 singleton = namefile + ".extra.temp";
328                 ofstream remainingNames;
329                 openOutputFile(singleton, remainingNames);
330                 
331                 bool wroteExtra = false;
332                                                 
333                 string name, nameList;
334                 while(!bigNameFile.eof()){
335                         bigNameFile >> name >> nameList;  gobble(bigNameFile);
336                         
337                         //did this sequence get assigned a group
338                         it = seqGroup.find(name);
339                         
340                         if (it != seqGroup.end()) {  
341                                 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
342                                 outFile << name << '\t' << nameList << endl;
343                                 outFile.close();
344                         }else{
345                                 wroteExtra = true;
346                                 remainingNames << name << '\t' << nameList << endl;
347                         }
348                 }
349                 bigNameFile.close();
350                 remainingNames.close();
351                 
352                 if (!wroteExtra) { 
353                         remove(singleton.c_str());
354                         singleton = "none";
355                 }
356                         
357                 for(int i=0;i<numGroups;i++){
358                         string tempNameFile = namefile + "." + toString(i) + ".temp";
359                         string tempDistFile = distFile + "." + toString(i) + ".temp";
360                                 
361                         map<string, string> temp;
362                         temp[tempDistFile] = tempNameFile;
363                         dists.push_back(temp);
364                 }
365                 
366                 if (m->control_pressed)  {  
367                         for (int i = 0; i < dists.size(); i++) { 
368                                 remove((dists[i].begin()->first).c_str());
369                                 remove((dists[i].begin()->second).c_str());
370                         }
371                         dists.clear();
372                 }
373                 
374                 return 0;
375                         
376         }
377         catch(exception& e) {
378                 m->errorOut(e, "SplitMatrix", "splitClassify");
379                 exit(1);
380         }
381 }
382 //********************************************************************************************************************
383 //sorts biggest to smallest
384 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
385         
386         FILE * pFile;
387         long leftsize = 0;
388                 
389         //get num bytes in file
390         string filename = left.begin()->first;
391         pFile = fopen (filename.c_str(),"rb");
392         string error = "Error opening " + filename;
393         if (pFile==NULL) perror (error.c_str());
394         else{
395                 fseek (pFile, 0, SEEK_END);
396                 leftsize=ftell (pFile);
397                 fclose (pFile);
398         }
399
400         FILE * pFile2;
401         long rightsize = 0;
402                 
403         //get num bytes in file
404         filename = right.begin()->first;
405         pFile2 = fopen (filename.c_str(),"rb");
406         error = "Error opening " + filename;
407         if (pFile2==NULL) perror (error.c_str());
408         else{
409                 fseek (pFile2, 0, SEEK_END);
410                 rightsize=ftell (pFile2);
411                 fclose (pFile2);
412         }
413
414         return (leftsize > rightsize);  
415
416 /***********************************************************************/
417 //returns map of distance files -> namefile sorted by distance file size
418 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
419         try {   
420                 
421                 sort(dists.begin(), dists.end(), compareFileSizes);
422                 
423                 return dists;
424         }
425         catch(exception& e) {
426                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
427                 exit(1);
428         }
429 }
430 /***********************************************************************/
431 SplitMatrix::~SplitMatrix(){}
432 /***********************************************************************/
433