]> git.donarmstrong.com Git - mothur.git/blob - splitmatrix.cpp
changed hard parameter in cluster commands
[mothur.git] / splitmatrix.cpp
1 /*
2  *  splitmatrix.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "splitmatrix.h"
11 #include "phylotree.h"
12
13 /***********************************************************************/
14
15 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
16         m = MothurOut::getInstance();
17         distFile = distfile;
18         cutoff = c;
19         namefile = name;
20         method = t;
21         taxFile = tax;
22 }
23
24 /***********************************************************************/
25
26 int SplitMatrix::split(){
27         try {
28         
29                 if (method == "distance") {  
30                         splitDistance();
31                 }else if (method == "classify") {
32                         splitClassify();
33                 }else {
34                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
35                         map<string, string> temp;
36                         temp[distFile] = namefile;
37                         dists.push_back(temp);
38                 }
39                 
40                 return 0;
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "SplitMatrix", "split");
44                 exit(1);
45         }
46 }
47 /***********************************************************************/
48 int SplitMatrix::splitDistance(){
49         try {
50         
51                 vector<set<string> > groups;
52                 int numGroups = 0;
53
54                 ofstream outFile;
55                 ifstream dFile;
56                 openInputFile(distFile, dFile);
57         
58                 while(dFile){
59                         string seqA, seqB;
60                         float dist;
61
62                         dFile >> seqA >> seqB >> dist;
63                         
64                         if (m->control_pressed) {  outFile.close(); dFile.close();  for(int i=0;i<numGroups;i++){       if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
65                                         
66                         if(dist < cutoff){
67                                 //cout << "in cutoff: " << dist << endl;
68                                 int groupIDA = -1;
69                                 int groupIDB = -1;
70                                 int groupID = -1;
71                                 int prevGroupID = -1;
72                                 
73                                 for(int i=0;i<numGroups;i++){
74                                         set<string>::iterator aIt = groups[i].find(seqA);
75                                         set<string>::iterator bIt = groups[i].find(seqB);
76                                         
77                                         if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
78                                                 groups[i].insert(seqB);
79                                                 groupIDA = i;
80                                                 groupID = groupIDA;
81
82                                                 //cout << "in aIt: " << groupID << endl;
83         //                                      break;
84                                         }
85                                         else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
86                                                 groups[i].insert(seqA);
87                                                 groupIDB = i;
88                                                 groupID = groupIDB;
89
90                                         //      cout << "in bIt: " << groupID << endl;
91         //                                      break;
92                                         }
93                                 
94                                         if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
95                                                 if(groupIDA < groupIDB){
96                                                 //      cout << "A: " << groupIDA << "\t" << groupIDB << endl;
97                                                         groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
98                                                         groups[groupIDB].clear(); 
99                                                         groupID = groupIDA;
100                                                 }
101                                                 else{
102                                                 //      cout << "B: " << groupIDA << "\t" << groupIDB << endl;
103                                                         groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
104                                                         groups[groupIDA].clear();  
105                                                         groupID = groupIDB;
106                                                 }
107                                                 break;
108                                         }
109                                 }
110                                 
111         //windows is gonna gag on the reuse of outFile, will need to make it local...
112                                 
113                                 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
114                                         set<string> newGroup;
115                                         newGroup.insert(seqA);
116                                         newGroup.insert(seqB);
117                                         groups.push_back(newGroup);
118                                                                         
119                                         outFile.close();
120                                         string fileName = distFile + "." + toString(numGroups) + ".temp";
121                                         outFile.open(fileName.c_str(), ios::ate);
122
123                                         outFile << seqA << '\t' << seqB << '\t' << dist << endl;
124                                         numGroups++;
125                                 }
126                                 else{
127                                         string fileName = distFile + "." + toString(groupID) + ".temp";
128                                         if(groupID != prevGroupID){
129                                                 outFile.close();
130                                                 outFile.open(fileName.c_str(), ios::app);
131                                                 prevGroupID     = groupID;
132                                         }
133                                         outFile << seqA << '\t' << seqB << '\t' << dist << endl;
134                                         
135                                         if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
136                                                 string row, column, distance;
137                                                 if(groupIDA<groupIDB){
138                                                         string fileName = distFile + "." + toString(groupIDB) + ".temp";
139                                                         ifstream fileB(fileName.c_str());
140                                                         while(fileB){
141                                                                 fileB >> row >> column >> distance;
142                                                                 outFile << row << '\t' << column << '\t' << distance << endl;
143                                                                 gobble(fileB);
144                                                         }
145                                                         fileB.close();
146                                                         remove(fileName.c_str());
147                                                 }
148                                                 else{
149                                                         string fileName = distFile + "." + toString(groupIDA) + ".temp";
150                                                         ifstream fileA(fileName.c_str());
151                                                         while(fileA){
152                                                                 fileA >> row >> column >> distance;
153                                                                 outFile << row << '\t' << column << '\t' << distance << endl;
154                                                                 gobble(fileA);
155                                                         }
156                                                         fileA.close();
157                                                         remove(fileName.c_str());
158                                                 }                                       
159                                         }
160                                 }
161                         }
162                         gobble(dFile);
163                 }
164                 outFile.close();
165                 dFile.close();
166         
167                 ifstream bigNameFile(namefile.c_str());
168                 if(!bigNameFile){
169                         cerr << "Error: We can't open the name file\n";
170                         exit(1);
171                 }
172                 
173                 map<string, string> nameMap;
174                 string name, nameList;
175                 while(bigNameFile){
176                         bigNameFile >> name >> nameList;
177                         nameMap[name] = nameList;
178                         gobble(bigNameFile);
179                 }
180                 bigNameFile.close();
181                         
182                 for(int i=0;i<numGroups;i++){  //parse names file to match distance files
183                         int numSeqsInGroup = groups[i].size();
184                         
185                         if(numSeqsInGroup > 0){
186                                 string fileName = namefile + "." + toString(i) + ".temp";
187                                 ofstream smallNameFile(fileName.c_str(), ios::ate);
188                                 
189                                 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
190                                         map<string,string>::iterator nIt = nameMap.find(*gIt);
191                                         
192                                         if (nIt != nameMap.end()) {
193                                                 smallNameFile << nIt->first << '\t' << nIt->second << endl;
194                                                 nameMap.erase(nIt);
195                                         }else{
196                                                 m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
197                                         }
198                                 }
199                                 smallNameFile.close();
200                         }
201                 }
202                 
203                 //names of singletons
204                 if (nameMap.size() != 0) {
205                         singleton = namefile + ".extra.temp";
206                         ofstream remainingNames(singleton.c_str(), ios::ate);
207                         for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
208                                 remainingNames << nIt->first << '\t' << nIt->second << endl;
209                         }
210                         remainingNames.close();
211                 }else { singleton = "none"; }
212                         
213                 for(int i=0;i<numGroups;i++){
214                         if(groups[i].size() > 0){
215                                 string tempNameFile = namefile + "." + toString(i) + ".temp";
216                                 string tempDistFile = distFile + "." + toString(i) + ".temp";
217                                 
218                                 map<string, string> temp;
219                                 temp[tempDistFile] = tempNameFile;
220                                 dists.push_back(temp);
221                         }
222                 }
223                 
224                 if (m->control_pressed)  {  
225                         for (int i = 0; i < dists.size(); i++) { 
226                                 remove((dists[i].begin()->first).c_str());
227                                 remove((dists[i].begin()->second).c_str());
228                         }
229                         dists.clear();
230                 }
231                 
232                 return 0;
233                         
234         }
235         catch(exception& e) {
236                 m->errorOut(e, "SplitMatrix", "splitDistance");
237                 exit(1);
238         }
239 }
240
241 /***********************************************************************/
242 int SplitMatrix::splitClassify(){
243         try {
244                 map<string, int> seqGroup;
245                 map<string, int>::iterator it;
246                 map<string, int>::iterator it2;
247                 
248                 int numGroups = 0;
249                 
250                 //build tree from users taxonomy file
251                 PhyloTree* phylo = new PhyloTree();
252                 
253                 ifstream in;
254                 openInputFile(taxFile, in);
255                         
256                 //read in users taxonomy file and add sequences to tree
257                 string seqname, tax;
258                 while(!in.eof()){
259                         in >> seqname >> tax; gobble(in);
260                                 
261                         phylo->addSeqToTree(seqname, tax);
262                 }
263                 in.close();
264                 
265                 phylo->assignHeirarchyIDs(0);
266
267                 //make sure the cutoff is not greater than maxlevel
268                 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
269                 
270                 //for each node in tree
271                 for (int i = 0; i < phylo->getNumNodes(); i++) {
272                 
273                         //is this node within the cutoff
274                         TaxNode taxon = phylo->get(i);
275                         
276                         if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
277                                 if (taxon.children.size() > 1) { //if this taxon just has one seq its a singleton
278                                         for (it = taxon.children.begin(); it != taxon.children.end(); it++) {
279                                                 seqGroup[it->first] = numGroups;
280                                         }
281                                         numGroups++;
282                                 }
283                         }
284                 }
285
286                 ifstream dFile;
287                 openInputFile(distFile, dFile);
288                 ofstream outFile;
289                 
290                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
291                         remove((distFile + "." + toString(i) + ".temp").c_str());
292                 }
293                 
294                 //for each distance
295                 while(dFile){
296                         string seqA, seqB;
297                         float dist;
298                         
299                         if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());        } }
300                         
301                         dFile >> seqA >> seqB >> dist;  gobble(dFile);
302                         
303                         //if both sequences are in the same group then they are within the cutoff
304                         it = seqGroup.find(seqA);
305                         it2 = seqGroup.find(seqB);
306                         
307                         if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
308                                 if (it->second == it2->second) { //they are from the same group so add the distance
309                                         openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
310                                         outFile << seqA << '\t' << seqB << '\t' << dist << endl;
311                                         outFile.close();
312                                 }
313                         }
314                 }
315                 dFile.close();
316         
317                 
318                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
319                         remove((namefile + "." + toString(i) + ".temp").c_str());
320                 }
321                 
322                 ifstream bigNameFile;
323                 openInputFile(namefile, bigNameFile);
324                 
325                 singleton = namefile + ".extra.temp";
326                 ofstream remainingNames;
327                 openOutputFile(singleton, remainingNames);
328                 
329                 bool wroteExtra = false;
330                                                 
331                 string name, nameList;
332                 while(!bigNameFile.eof()){
333                         bigNameFile >> name >> nameList;  gobble(bigNameFile);
334                         
335                         //did this sequence get assigned a group
336                         it = seqGroup.find(name);
337                         
338                         if (it != seqGroup.end()) {  
339                                 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
340                                 outFile << name << '\t' << nameList << endl;
341                                 outFile.close();
342                         }else{
343                                 wroteExtra = true;
344                                 remainingNames << name << '\t' << nameList << endl;
345                         }
346                 }
347                 bigNameFile.close();
348                 remainingNames.close();
349                 
350                 if (!wroteExtra) { 
351                         remove(singleton.c_str());
352                         singleton = "none";
353                 }
354                         
355                 for(int i=0;i<numGroups;i++){
356                         string tempNameFile = namefile + "." + toString(i) + ".temp";
357                         string tempDistFile = distFile + "." + toString(i) + ".temp";
358                                 
359                         map<string, string> temp;
360                         temp[tempDistFile] = tempNameFile;
361                         dists.push_back(temp);
362                 }
363                 
364                 if (m->control_pressed)  {  
365                         for (int i = 0; i < dists.size(); i++) { 
366                                 remove((dists[i].begin()->first).c_str());
367                                 remove((dists[i].begin()->second).c_str());
368                         }
369                         dists.clear();
370                 }
371                 
372                 return 0;
373                         
374         }
375         catch(exception& e) {
376                 m->errorOut(e, "SplitMatrix", "splitClassify");
377                 exit(1);
378         }
379 }
380 //********************************************************************************************************************
381 //sorts biggest to smallest
382 inline bool compareFileSizes(map<string, string> left, map<string, string> right){
383         
384         FILE * pFile;
385         long leftsize = 0;
386                 
387         //get num bytes in file
388         string filename = left.begin()->first;
389         pFile = fopen (filename.c_str(),"rb");
390         string error = "Error opening " + filename;
391         if (pFile==NULL) perror (error.c_str());
392         else{
393                 fseek (pFile, 0, SEEK_END);
394                 leftsize=ftell (pFile);
395                 fclose (pFile);
396         }
397
398         FILE * pFile2;
399         long rightsize = 0;
400                 
401         //get num bytes in file
402         filename = right.begin()->first;
403         pFile2 = fopen (filename.c_str(),"rb");
404         error = "Error opening " + filename;
405         if (pFile2==NULL) perror (error.c_str());
406         else{
407                 fseek (pFile2, 0, SEEK_END);
408                 rightsize=ftell (pFile2);
409                 fclose (pFile2);
410         }
411
412         return (leftsize > rightsize);  
413
414 /***********************************************************************/
415 //returns map of distance files -> namefile sorted by distance file size
416 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
417         try {   
418                 
419                 sort(dists.begin(), dists.end(), compareFileSizes);
420                 
421                 return dists;
422         }
423         catch(exception& e) {
424                 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
425                 exit(1);
426         }
427 }
428 /***********************************************************************/
429 SplitMatrix::~SplitMatrix(){}
430 /***********************************************************************/
431