]> git.donarmstrong.com Git - mothur.git/blob - readcluster.cpp
fixed bug in hcluster command
[mothur.git] / readcluster.cpp
1 /*
2  *  readcluster.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/28/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "readcluster.h"
11
12 /***********************************************************************/
13
14 ReadCluster::ReadCluster(string distfile, float c){
15                 globaldata = GlobalData::getInstance();
16         distFile = distfile;
17                 cutoff = c;
18 }
19
20 /***********************************************************************/
21
22 void ReadCluster::read(NameAssignment* nameMap){
23         try {
24         
25                 if (format == "phylip") { convertPhylip2Column(nameMap); }
26                 else { list = new ListVector(nameMap->getListVector());  }
27                 
28                 createHClusterFile();
29                         
30         }
31         catch(exception& e) {
32                 errorOut(e, "ReadCluster", "read");
33                 exit(1);
34         }
35 }
36 /***********************************************************************/
37
38 void ReadCluster::createHClusterFile(){
39         try {   
40                 string outfile = getRootName(distFile) + "sorted.dist";
41                 
42                 //if you can, use the unix sort since its been optimized for years
43                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
44                         string command = "sort -n -k +3 " + distFile + " -o " + outfile;
45                         system(command.c_str());
46                 #else //you are stuck with my best attempt...
47                         //windows sort does not have a way to specify a column, only a character in the line
48                         //since we cannot assume that the distance will always be at the the same character location on each line
49                         //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
50                 
51                         //read in file line by file and put distance first
52                         string tempDistFile = distFile + ".temp";
53                         ifstream input;
54                         ofstream output;
55                         openInputFile(distFile, input);
56                         openOutputFile(tempDistFile, output);
57
58                         string firstName, secondName;
59                         float dist;
60                         while (input) {
61                                 input >> firstName >> secondName >> dist;
62                                 output << dist << '\t' << firstName << '\t' << secondName << endl;
63                                 gobble(input);
64                         }
65                         input.close();
66                         output.close();
67                 
68         
69                         //sort using windows sort
70                         string tempOutfile = outfile + ".temp";
71                         string command = "sort " + tempDistFile + " /O " + tempOutfile;
72                         system(command.c_str());
73                 
74                         //read in sorted file and put distance at end again
75                         ifstream input2;
76                         openInputFile(tempOutfile, input2);
77                         openOutputFile(outfile, output);
78                 
79                         while (input2) {
80                                 input2 >> dist >> firstName >> secondName;
81                                 output << firstName << '\t' << secondName << '\t' << dist << endl;
82                                 gobble(input2);
83                         }
84                         input2.close();
85                         output.close();
86                 
87                         //remove temp files
88                         remove(tempDistFile.c_str());
89                         remove(tempOutfile.c_str());
90                 #endif
91                 
92                 OutPutFile = outfile;
93         }
94         catch(exception& e) {
95                 errorOut(e, "ReadCluster", "createHClusterFile");
96                 exit(1);
97         }
98 }
99
100
101 /***********************************************************************/
102
103 void ReadCluster::convertPhylip2Column(NameAssignment* nameMap){
104         try {   
105                 //convert phylip file to column file
106                 map<int, string> rowToName;
107                 map<int, string>::iterator it;
108                 
109                 ifstream in;
110                 ofstream out;
111                 string tempFile = distFile + ".column.temp";
112                 
113                 openInputFile(distFile, in);
114                 openOutputFile(tempFile, out);
115                 
116                 float distance;
117                 int square, nseqs;
118                 string name;
119                 vector<string> matrixNames;
120         
121                 in >> nseqs >> name;
122                 rowToName[0] = name;
123                 matrixNames.push_back(name);
124                 
125                 if(nameMap == NULL){
126                         list = new ListVector(nseqs);
127                         list->set(0, name);
128                 }
129                 else{
130                         list = new ListVector(nameMap->getListVector());
131                         if(nameMap->count(name)==0){        mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
132                 }
133         
134                 char d;
135                 while((d=in.get()) != EOF){
136                         
137                         if(isalnum(d)){
138                                 square = 1;
139                                 in.putback(d);
140                                 for(int i=0;i<nseqs;i++){
141                                         in >> distance;
142                                 }
143                                 break;
144                         }
145                         if(d == '\n'){
146                                 square = 0;
147                                 break;
148                         }
149                 }
150         
151                 if(square == 0){
152                                         
153                         for(int i=1;i<nseqs;i++){
154                                 in >> name;
155                                 rowToName[i] = name;
156                                 matrixNames.push_back(name);
157                                 
158                                 //there's A LOT of repeated code throughout this method...
159                                 if(nameMap == NULL){
160                                         list->set(i, name);
161                                         
162                                         for(int j=0;j<i;j++){
163                                                 in >> distance;
164                                                 
165                                                 if (distance == -1) { distance = 1000000; }
166                                                 
167                                                 if(distance < cutoff){
168                                                         out << i << '\t' << j << '\t' << distance << endl;
169                                                 }
170                                         }
171                                         
172                                 }
173                                 else{
174                                         if(nameMap->count(name)==0){        mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
175                                         
176                                         for(int j=0;j<i;j++){
177                                                 in >> distance;
178                                                 
179                                                 if (distance == -1) { distance = 1000000; }
180                                                 
181                                                 if(distance < cutoff){
182                                                         out << i << '\t' << j << '\t' << distance << endl;
183                                                 }
184                                         }
185                                 }
186                         }
187                 }
188                 else{
189                         for(int i=1;i<nseqs;i++){
190                                 in >> name;                
191                                 rowToName[i] = name;
192                                 matrixNames.push_back(name);
193                 
194                                 if(nameMap == NULL){
195                                         list->set(i, name);
196                                         for(int j=0;j<nseqs;j++){
197                                                 in >> distance;
198                                         
199                                                 if (distance == -1) { distance = 1000000; }
200                                                 
201                                                 if(distance < cutoff && j < i){
202                                                         out << i << '\t' << j << '\t' << distance << endl;
203                                                 }
204                                         }
205                                 }
206                                 else{
207                                         if(nameMap->count(name)==0){        mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); mothurOutEndLine(); }
208                                         
209                                         for(int j=0;j<nseqs;j++){
210                                                 in >> distance;
211                         
212                                                 if (distance == -1) { distance = 1000000; }
213                                                 
214                                                 if(distance < cutoff && j < i){
215                                                         out << i << '\t' << j << '\t' << distance << endl;
216                                                 }
217                                                 
218                                         }
219                                 }
220                         }
221                 }
222                 
223                 list->setLabel("0");
224                 in.close();
225                 out.close();
226                 
227                 if(nameMap == NULL){
228                         nameMap = new NameAssignment();
229                         for(int i=0;i<matrixNames.size();i++){
230                                 nameMap->push_back(matrixNames[i]);
231                         }
232                         globaldata->nameMap = nameMap;
233                 }
234                 
235         
236                 ifstream in2;
237                 ofstream out2;
238                 
239                 string outputFile = getRootName(distFile) + "column.dist";
240                 openInputFile(tempFile, in2);
241                 openOutputFile(outputFile, out2);
242                 
243                 int first, second;
244                 float dist;
245                 
246                 while (in2) {
247                         in2 >> first >> second >> dist;
248                         out2 << rowToName[first] << '\t' << rowToName[second] << '\t' << dist << endl;
249                         gobble(in2);
250                 }
251                 in2.close();
252                 out2.close();
253                 
254                 remove(tempFile.c_str());
255                 distFile = outputFile;
256         }
257         catch(exception& e) {
258                 errorOut(e, "ReadCluster", "convertPhylip2Column");
259                 exit(1);
260         }
261 }
262 /***********************************************************************/
263
264 ReadCluster::~ReadCluster(){}
265 /***********************************************************************/
266