]> git.donarmstrong.com Git - mothur.git/blob - readcluster.cpp
added hcluster command and fixed some bugs, namely one with smart distancing.
[mothur.git] / readcluster.cpp
1 /*
2  *  readcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/28/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "readcluster.h"
11
12 /***********************************************************************/
13
14 ReadCluster::ReadCluster(string distfile, float c){
15         distFile = distfile;
16                 cutoff = c;
17 }
18
19 /***********************************************************************/
20
21 void ReadCluster::read(NameAssignment* nameMap){
22         try {
23         
24                 if (format == "phylip") { convertPhylip2Column(nameMap); }
25                 else { list = new ListVector(nameMap->getListVector());  }
26                 
27                 createHClusterFile();
28                         
29         }
30         catch(exception& e) {
31                 errorOut(e, "ReadCluster", "read");
32                 exit(1);
33         }
34 }
35 /***********************************************************************/
36
37 void ReadCluster::createHClusterFile(){
38         try {   
39                 string outfile = getRootName(distFile) + "sorted.dist";
40                 
41                 //if you can, use the unix sort since its been optimized for years
42                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
43                         string command = "sort -n -k +3 " + distFile + " -o " + outfile;
44                         system(command.c_str());
45                 #else //you are stuck with my best attempt...
46                         //windows sort does not have a way to specify a column, only a character in the line
47                         //since we cannot assume that the distance will always be at the the same character location on each line
48                         //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
49                 
50                         //read in file line by file and put distance first
51                         string tempDistFile = distFile + ".temp";
52                         ifstream input;
53                         ofstream output;
54                         openInputFile(distFile, input);
55                         openOutputFile(tempDistFile, output);
56
57                         string firstName, secondName;
58                         float dist;
59                         while (input) {
60                                 input >> firstName >> secondName >> dist;
61                                 output << dist << '\t' << firstName << '\t' << secondName << endl;
62                                 gobble(input);
63                         }
64                         input.close();
65                         output.close();
66                 
67         
68                         //sort using windows sort
69                         string tempOutfile = outfile + ".temp";
70                         string command = "sort " + tempDistFile + " /O " + tempOutfile;
71                         system(command.c_str());
72                 
73                         //read in sorted file and put distance at end again
74                         ifstream input2;
75                         openInputFile(tempOutfile, input2);
76                         openOutputFile(outfile, output);
77                 
78                         while (input2) {
79                                 input2 >> dist >> firstName >> secondName;
80                                 output << firstName << '\t' << secondName << '\t' << dist << endl;
81                                 gobble(input2);
82                         }
83                         input2.close();
84                         output.close();
85                 
86                         //remove temp files
87                         remove(tempDistFile.c_str());
88                         remove(tempOutfile.c_str());
89                 #endif
90                 
91                 OutPutFile = outfile;
92         }
93         catch(exception& e) {
94                 errorOut(e, "ReadCluster", "createHClusterFile");
95                 exit(1);
96         }
97 }
98
99
100 /***********************************************************************/
101
102 void ReadCluster::convertPhylip2Column(NameAssignment* nameMap){
103         try {   
104                 //convert phylip file to column file
105                 map<int, string> rowToName;
106                 map<int, string>::iterator it;
107                 
108                 ifstream in;
109                 ofstream out;
110                 string tempFile = distFile + ".column.temp";
111                 
112                 openInputFile(distFile, in);
113                 openOutputFile(tempFile, out);
114                 
115                 float distance;
116                 int square, nseqs;
117                 string name;
118                 vector<string> matrixNames;
119         
120                 in >> nseqs >> name;
121                 rowToName[0] = name;
122                 matrixNames.push_back(name);
123                 
124                 if(nameMap == NULL){
125                         list = new ListVector(nseqs);
126                         list->set(0, name);
127                 }
128                 else{
129                         list = new ListVector(nameMap->getListVector());
130                         if(nameMap->count(name)==0){        mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
131                 }
132         
133                 char d;
134                 while((d=in.get()) != EOF){
135                         
136                         if(isalnum(d)){
137                                 square = 1;
138                                 in.putback(d);
139                                 for(int i=0;i<nseqs;i++){
140                                         in >> distance;
141                                 }
142                                 break;
143                         }
144                         if(d == '\n'){
145                                 square = 0;
146                                 break;
147                         }
148                 }
149         
150                 if(square == 0){
151                                         
152                         for(int i=1;i<nseqs;i++){
153                                 in >> name;
154                                 rowToName[i] = name;
155                                 matrixNames.push_back(name);
156                                 
157                                 //there's A LOT of repeated code throughout this method...
158                                 if(nameMap == NULL){
159                                         list->set(i, name);
160                                         
161                                         for(int j=0;j<i;j++){
162                                                 in >> distance;
163                                                 
164                                                 if (distance == -1) { distance = 1000000; }
165                                                 
166                                                 if(distance < cutoff){
167                                                         out << i << '\t' << j << '\t' << distance << endl;
168                                                 }
169                                         }
170                                         
171                                 }
172                                 else{
173                                         if(nameMap->count(name)==0){        mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
174                                         
175                                         for(int j=0;j<i;j++){
176                                                 in >> distance;
177                                                 
178                                                 if (distance == -1) { distance = 1000000; }
179                                                 
180                                                 if(distance < cutoff){
181                                                         out << i << '\t' << j << '\t' << distance << endl;
182                                                 }
183                                         }
184                                 }
185                         }
186                 }
187                 else{
188                         
189                         for(int i=1;i<nseqs;i++){
190                                 in >> name;                
191                                 rowToName[i] = name;
192                                 matrixNames.push_back(name);
193                                 
194                                 if(nameMap == NULL){
195                                         list->set(i, name);
196                                         for(int j=0;j<nseqs;j++){
197                                                 in >> distance;
198                                                 
199                                                 if (distance == -1) { distance = 1000000; }
200                                                 
201                                                 if(distance < cutoff && j < i){
202                                                         out << i << '\t' << j << '\t' << distance << endl;
203                                                 }
204                                         }
205                                         
206                                 }
207                                 else{
208                                         if(nameMap->count(name)==0){        mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
209                                         
210                                         for(int j=0;j<nseqs;j++){
211                                                 in >> distance;
212                         
213                                                 if (distance == -1) { distance = 1000000; }
214                                                 
215                                                 if(distance < cutoff && j < i){
216                                                         out << i << '\t' << j << '\t' << distance << endl;
217                                                 }
218                                                 
219                                         }
220                                 }
221                         }
222                 }
223                 
224                 list->setLabel("0");
225                 in.close();
226                 out.close();
227                 
228                 if(nameMap == NULL){
229                         for(int i=0;i<matrixNames.size();i++){
230                                 nameMap->push_back(matrixNames[i]);
231                         }
232                 }
233                 
234                 ifstream in2;
235                 ofstream out2;
236                 
237                 string outputFile = getRootName(distFile) + "column.dist";
238                 openInputFile(tempFile, in2);
239                 openOutputFile(outputFile, out2);
240                 
241                 int first, second;
242                 float dist;
243                 
244                 while (in2) {
245                         in2 >> first >> second >> dist;
246                         out2 << rowToName[first] << '\t' << rowToName[second] << '\t' << dist << endl;
247                         gobble(in2);
248                 }
249                 in2.close();
250                 out2.close();
251                 
252                 remove(tempFile.c_str());
253                 distFile = outputFile;
254         }
255         catch(exception& e) {
256                 errorOut(e, "ReadCluster", "convertPhylip2Column");
257                 exit(1);
258         }
259 }
260 /***********************************************************************/
261
262 ReadCluster::~ReadCluster(){}
263 /***********************************************************************/
264