]> git.donarmstrong.com Git - mothur.git/blob - readblast.cpp
fixed some bugs and added mgcluster command
[mothur.git] / readblast.cpp
1 /*
2  *  readblast.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/10/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "readblast.h"
11 #include "progress.hpp"
12
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(DistNode left, DistNode right){
16         return (left.dist < right.dist);        
17
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool m, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(m), hclusterWanted(h) {
20         try {
21                 matrix = NULL;
22         }
23         catch(exception& e) {
24                 errorOut(e, "ReadBlast", "ReadBlast");
25                 exit(1);
26         }
27
28 /*********************************************************************************************/
29 //assumptions about the blast file: 
30 //1. if duplicate lines occur the first line is always best and is chosen
31 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
32 void ReadBlast::read(NameAssignment* nameMap) {
33         try {
34         
35                 //if the user has not given a names file read names from blastfile
36                 if (nameMap->size() == 0) { readNames(nameMap);  }
37                 int nseqs = nameMap->size();
38
39                 ifstream fileHandle;
40                 openInputFile(blastfile, fileHandle);
41                 
42                 string firstName, secondName, eScore, currentRow;
43                 string repeatName = "";
44                 int count = 1;
45                 float distance, thisoverlap, refScore;
46                 float percentId; 
47                 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
48                 
49                 ofstream outDist;
50                 ofstream outOverlap;
51                 
52                 //create objects needed for read
53                 if (!hclusterWanted) {
54                         matrix = new SparseMatrix();
55                 }else{
56                         overlapFile = getRootName(blastfile) + "overlap.dist";
57                         distFile = getRootName(blastfile) + "hclusterDists.dist";
58                         
59                         openOutputFile(overlapFile, outOverlap);
60                         openOutputFile(distFile, outDist);
61                 }
62         
63                 Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
64                 
65                 //this is used to quickly find if we already have a distance for this combo
66                 vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
67                 map<int, float> thisRowsBlastScores;
68                 
69                 if (!fileHandle.eof()) {
70                         //read in line from file
71                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
72                         gobble(fileHandle);
73                         
74                         currentRow = firstName;
75                         lengthThisSeq = numBases;
76                         repeatName = firstName + secondName;
77                         
78                         if (firstName == secondName) {   refScore = score;  }
79                         else{
80                                 //convert name to number
81                                 map<string,int>::iterator itA = nameMap->find(firstName);
82                                 map<string,int>::iterator itB = nameMap->find(secondName);
83                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
84                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
85                                 
86                                 thisRowsBlastScores[itB->second] = score;
87                                 
88                                 //calc overlap score
89                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
90                                 
91                                 //if there is a valid overlap, add it
92                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
93                                         if (!hclusterWanted) {
94                                                 DistNode overlapValue(itA->second, itB->second, thisoverlap);
95                                                 overlap.push_back(overlapValue);
96                                         }else {
97                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
98                                         }
99                                 }
100                         }
101                 }else { mothurOut("Error in your blast file, cannot read."); mothurOutEndLine(); exit(1); }
102
103                                 
104                 //read file
105                 while(!fileHandle.eof()){  
106                         
107                         //read in line from file
108                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
109                         //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
110                         gobble(fileHandle);
111                         
112                         string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
113                         
114                         //if this is a new pairing
115                         if (temp != repeatName) {
116                                 repeatName = temp; 
117                                 
118                                 if (currentRow == firstName) {
119                                         //cout << "first = " << firstName << " second = " << secondName << endl;        
120                                         if (firstName == secondName) { 
121                                                 refScore = score;  
122                                                 reading->update((count + nseqs));  
123                                                 count++; 
124                                         }else{
125                                                 //convert name to number
126                                                 map<string,int>::iterator itA = nameMap->find(firstName);
127                                                 map<string,int>::iterator itB = nameMap->find(secondName);
128                                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
129                                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
130                                                 
131                                                 //save score
132                                                 thisRowsBlastScores[itB->second] = score;
133                                                         
134                                                 //calc overlap score
135                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
136                                                 
137                                                 //if there is a valid overlap, add it
138                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
139                                                         if (!hclusterWanted) {
140                                                                 DistNode overlapValue(itA->second, itB->second, thisoverlap);
141                                                                 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
142                                                                 overlap.push_back(overlapValue);
143                                                         }else {
144                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
145                                                         }
146                                                 }
147                                         } //end else
148                                 }else { //end row
149                                         //convert blast scores to distance and add cell to sparse matrix if we can
150                                         map<int, float>::iterator it;
151                                         map<int, float>::iterator itDist;
152                                         for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
153                                                 distance = 1.0 - (it->second / refScore);
154                                                 
155                                                 //do we already have the distance calculated for b->a
156                                                 map<string,int>::iterator itA = nameMap->find(currentRow);
157                                                 itDist = dists[it->first].find(itA->second);
158                                                 
159                                                 //if we have it then compare
160                                                 if (itDist != dists[it->first].end()) {
161                                                         //if you want the minimum blast score ratio, then pick max distance
162                                                         if(minWanted) {  distance = max(itDist->second, distance);  }
163                                                         else{   distance = min(itDist->second, distance);  }
164                                                         
165                                                         //is this distance below cutoff
166                                                         if (distance < cutoff) {
167                                                                 if (!hclusterWanted) {
168                                                                         PCell value(itA->second, it->first, distance);
169                                                                         matrix->addCell(value);
170                                                                 }else{
171                                                                         outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
172                                                                 }
173                                                         }
174                                                         //not going to need this again
175                                                         dists[it->first].erase(itDist);
176                                                 }else { //save this value until we get the other ratio
177                                                         dists[itA->second][it->first] = distance;
178                                                 }
179                                         }
180                                         //clear out last rows info
181                                         thisRowsBlastScores.clear();
182                                         
183                                         currentRow = firstName;
184                                         lengthThisSeq = numBases;
185                                         
186                                         //add this row to thisRowsBlastScores
187                                         if (firstName == secondName) {   refScore = score;  }
188                                         else{ //add this row to thisRowsBlastScores
189                                                 
190                                                 //convert name to number
191                                                 map<string,int>::iterator itA = nameMap->find(firstName);
192                                                 map<string,int>::iterator itB = nameMap->find(secondName);
193                                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
194                                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
195                                                 
196                                                 thisRowsBlastScores[itB->second] = score;
197                                                 
198                                                 //calc overlap score
199                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
200                                                 
201                                                 //if there is a valid overlap, add it
202                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
203                                                         if (!hclusterWanted) {
204                                                                 DistNode overlapValue(itA->second, itB->second, thisoverlap);
205                                                                 overlap.push_back(overlapValue);
206                                                         }else {
207                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
208                                                         }
209                                                 }
210                                         }
211                                 }//end if current row
212                         }//end if repeat
213                 }//end while
214                 
215                 //get last rows info stored
216                 //convert blast scores to distance and add cell to sparse matrix if we can
217                 map<int, float>::iterator it;
218                 map<int, float>::iterator itDist;
219                 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
220                         distance = 1.0 - (it->second / refScore);
221                         
222                         //do we already have the distance calculated for b->a
223                         map<string,int>::iterator itA = nameMap->find(currentRow);
224                         itDist = dists[it->first].find(itA->second);
225                         
226                         //if we have it then compare
227                         if (itDist != dists[it->first].end()) {
228                                 //if you want the minimum blast score ratio, then pick max distance
229                                 if(minWanted) {  distance = max(itDist->second, distance);  }
230                                 else{   distance = min(itDist->second, distance);  }
231                                 
232                                 //is this distance below cutoff
233                                 if (distance < cutoff) {
234                                         if (!hclusterWanted) {
235                                                 PCell value(itA->second, it->first, distance);
236                                                 matrix->addCell(value);
237                                         }else{
238                                                 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
239                                         }
240                                 }
241                                 //not going to need this again
242                                 dists[it->first].erase(itDist);
243                         }else { //save this value until we get the other ratio
244                                 dists[itA->second][it->first] = distance;
245                         }
246                 }
247                 //clear out info
248                 thisRowsBlastScores.clear();
249                 dists.clear();
250                 
251                 if (!hclusterWanted) {
252                         sort(overlap.begin(), overlap.end(), compareOverlap);
253                 }else {
254                         outDist.close();
255                         outOverlap.close();
256                 }
257                 
258                 reading->finish();
259                 delete reading;
260                 fileHandle.close();
261         }
262         catch(exception& e) {
263                 errorOut(e, "ReadBlast", "read");
264                 exit(1);
265         }
266
267 /*********************************************************************************************/
268 void ReadBlast::readNames(NameAssignment* nameMap) {
269         try {
270                 mothurOut("Reading names... "); cout.flush();
271                 
272                 string name, hold, prevName;
273                 int num = 1;
274                 
275                 ifstream in;
276                 openInputFile(blastfile, in);
277                 
278                 //read first line
279                 in >> prevName;
280                 for (int i = 0; i < 11; i++) {  in >> hold;  }
281                 gobble(in);
282                 
283                 //save name in nameMap
284                 nameMap->push_back(prevName);
285                 
286                 while (!in.eof()) {
287                         
288                         //read line
289                         in >> name;
290                         for (int i = 0; i < 11; i++) {  in >> hold;  }
291                         gobble(in);
292                         
293                         //is this a new name?
294                         if (name != prevName) {
295                                 prevName = name;
296                                 nameMap->push_back(name);
297                                 num++;
298                         }
299                 }
300         
301                 in.close();
302                 
303                 //write out names file
304                 //string outNames = getRootName(blastfile) + "names";
305                 //ofstream out;
306                 //openOutputFile(outNames, out);
307                 //nameMap->print(out);
308                 //out.close();
309                 
310                 mothurOut(toString(num) + " names read."); mothurOutEndLine();
311                 
312         }
313         catch(exception& e) {
314                 errorOut(e, "ReadBlast", "readNames");
315                 exit(1);
316         }
317
318 /*********************************************************************************************/
319
320
321