]> git.donarmstrong.com Git - mothur.git/blob - readblast.cpp
created mothurOut class to handle logfiles
[mothur.git] / readblast.cpp
1 /*
2  *  readblast.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/10/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "readblast.h"
11 #include "progress.hpp"
12
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(seqDist left, seqDist right){
16         return (left.dist < right.dist);        
17
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
20         try {
21                 m = MothurOut::getInstance();
22                 matrix = NULL;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "ReadBlast", "ReadBlast");
26                 exit(1);
27         }
28
29 /*********************************************************************************************/
30 //assumptions about the blast file: 
31 //1. if duplicate lines occur the first line is always best and is chosen
32 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
33 void ReadBlast::read(NameAssignment* nameMap) {
34         try {
35         
36                 //if the user has not given a names file read names from blastfile
37                 if (nameMap->size() == 0) { readNames(nameMap);  }
38                 int nseqs = nameMap->size();
39
40                 ifstream fileHandle;
41                 openInputFile(blastfile, fileHandle);
42                 
43                 string firstName, secondName, eScore, currentRow;
44                 string repeatName = "";
45                 int count = 1;
46                 float distance, thisoverlap, refScore;
47                 float percentId; 
48                 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
49                 
50                 ofstream outDist;
51                 ofstream outOverlap;
52                 
53                 //create objects needed for read
54                 if (!hclusterWanted) {
55                         matrix = new SparseMatrix();
56                 }else{
57                         overlapFile = getRootName(blastfile) + "overlap.dist";
58                         distFile = getRootName(blastfile) + "hclusterDists.dist";
59                         
60                         openOutputFile(overlapFile, outOverlap);
61                         openOutputFile(distFile, outDist);
62                 }
63         
64                 Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
65                 
66                 //this is used to quickly find if we already have a distance for this combo
67                 vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
68                 map<int, float> thisRowsBlastScores;
69                 
70                 if (!fileHandle.eof()) {
71                         //read in line from file
72                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
73                         gobble(fileHandle);
74                         
75                         currentRow = firstName;
76                         lengthThisSeq = numBases;
77                         repeatName = firstName + secondName;
78                         
79                         if (firstName == secondName) {   refScore = score;  }
80                         else{
81                                 //convert name to number
82                                 map<string,int>::iterator itA = nameMap->find(firstName);
83                                 map<string,int>::iterator itB = nameMap->find(secondName);
84                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
85                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
86                                 
87                                 thisRowsBlastScores[itB->second] = score;
88                                 
89                                 //calc overlap score
90                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
91                                 
92                                 //if there is a valid overlap, add it
93                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
94                                         if (!hclusterWanted) {
95                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
96                                                 overlap.push_back(overlapValue);
97                                         }else {
98                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
99                                         }
100                                 }
101                         }
102                 }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
103
104                                 
105                 //read file
106                 while(!fileHandle.eof()){  
107                         
108                         //read in line from file
109                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
110                         //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
111                         gobble(fileHandle);
112                         
113                         string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
114                         
115                         //if this is a new pairing
116                         if (temp != repeatName) {
117                                 repeatName = temp; 
118                                 
119                                 if (currentRow == firstName) {
120                                         //cout << "first = " << firstName << " second = " << secondName << endl;        
121                                         if (firstName == secondName) { 
122                                                 refScore = score;  
123                                                 reading->update((count + nseqs));  
124                                                 count++; 
125                                         }else{
126                                                 //convert name to number
127                                                 map<string,int>::iterator itA = nameMap->find(firstName);
128                                                 map<string,int>::iterator itB = nameMap->find(secondName);
129                                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
130                                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
131                                                 
132                                                 //save score
133                                                 thisRowsBlastScores[itB->second] = score;
134                                                         
135                                                 //calc overlap score
136                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
137                                                 
138                                                 //if there is a valid overlap, add it
139                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
140                                                         if (!hclusterWanted) {
141                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
142                                                                 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
143                                                                 overlap.push_back(overlapValue);
144                                                         }else {
145                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
146                                                         }
147                                                 }
148                                         } //end else
149                                 }else { //end row
150                                         //convert blast scores to distance and add cell to sparse matrix if we can
151                                         map<int, float>::iterator it;
152                                         map<int, float>::iterator itDist;
153                                         for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
154                                                 distance = 1.0 - (it->second / refScore);
155                                                 
156                                                 //do we already have the distance calculated for b->a
157                                                 map<string,int>::iterator itA = nameMap->find(currentRow);
158                                                 itDist = dists[it->first].find(itA->second);
159                                                 
160                                                 //if we have it then compare
161                                                 if (itDist != dists[it->first].end()) {
162                                                         //if you want the minimum blast score ratio, then pick max distance
163                                                         if(minWanted) {  distance = max(itDist->second, distance);  }
164                                                         else{   distance = min(itDist->second, distance);  }
165                                                         
166                                                         //is this distance below cutoff
167                                                         if (distance < cutoff) {
168                                                                 if (!hclusterWanted) {
169                                                                         PCell value(itA->second, it->first, distance);
170                                                                         matrix->addCell(value);
171                                                                 }else{
172                                                                         outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
173                                                                 }
174                                                         }
175                                                         //not going to need this again
176                                                         dists[it->first].erase(itDist);
177                                                 }else { //save this value until we get the other ratio
178                                                         dists[itA->second][it->first] = distance;
179                                                 }
180                                         }
181                                         //clear out last rows info
182                                         thisRowsBlastScores.clear();
183                                         
184                                         currentRow = firstName;
185                                         lengthThisSeq = numBases;
186                                         
187                                         //add this row to thisRowsBlastScores
188                                         if (firstName == secondName) {   refScore = score;  }
189                                         else{ //add this row to thisRowsBlastScores
190                                                 
191                                                 //convert name to number
192                                                 map<string,int>::iterator itA = nameMap->find(firstName);
193                                                 map<string,int>::iterator itB = nameMap->find(secondName);
194                                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
195                                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
196                                                 
197                                                 thisRowsBlastScores[itB->second] = score;
198                                                 
199                                                 //calc overlap score
200                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
201                                                 
202                                                 //if there is a valid overlap, add it
203                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
204                                                         if (!hclusterWanted) {
205                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
206                                                                 overlap.push_back(overlapValue);
207                                                         }else {
208                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
209                                                         }
210                                                 }
211                                         }
212                                 }//end if current row
213                         }//end if repeat
214                 }//end while
215                 
216                 //get last rows info stored
217                 //convert blast scores to distance and add cell to sparse matrix if we can
218                 map<int, float>::iterator it;
219                 map<int, float>::iterator itDist;
220                 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
221                         distance = 1.0 - (it->second / refScore);
222                         
223                         //do we already have the distance calculated for b->a
224                         map<string,int>::iterator itA = nameMap->find(currentRow);
225                         itDist = dists[it->first].find(itA->second);
226                         
227                         //if we have it then compare
228                         if (itDist != dists[it->first].end()) {
229                                 //if you want the minimum blast score ratio, then pick max distance
230                                 if(minWanted) {  distance = max(itDist->second, distance);  }
231                                 else{   distance = min(itDist->second, distance);  }
232                                 
233                                 //is this distance below cutoff
234                                 if (distance < cutoff) {
235                                         if (!hclusterWanted) {
236                                                 PCell value(itA->second, it->first, distance);
237                                                 matrix->addCell(value);
238                                         }else{
239                                                 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
240                                         }
241                                 }
242                                 //not going to need this again
243                                 dists[it->first].erase(itDist);
244                         }else { //save this value until we get the other ratio
245                                 dists[itA->second][it->first] = distance;
246                         }
247                 }
248                 //clear out info
249                 thisRowsBlastScores.clear();
250                 dists.clear();
251                 
252                 if (!hclusterWanted) {
253                         sort(overlap.begin(), overlap.end(), compareOverlap);
254                 }else {
255                         outDist.close();
256                         outOverlap.close();
257                 }
258                 
259                 reading->finish();
260                 delete reading;
261                 fileHandle.close();
262         }
263         catch(exception& e) {
264                 m->errorOut(e, "ReadBlast", "read");
265                 exit(1);
266         }
267
268 /*********************************************************************************************/
269 void ReadBlast::readNames(NameAssignment* nameMap) {
270         try {
271                 m->mothurOut("Reading names... "); cout.flush();
272                 
273                 string name, hold, prevName;
274                 int num = 1;
275                 
276                 ifstream in;
277                 openInputFile(blastfile, in);
278                 
279                 //read first line
280                 in >> prevName;
281                 for (int i = 0; i < 11; i++) {  in >> hold;  }
282                 gobble(in);
283                 
284                 //save name in nameMap
285                 nameMap->push_back(prevName);
286                 
287                 while (!in.eof()) {
288                         
289                         //read line
290                         in >> name;
291                         for (int i = 0; i < 11; i++) {  in >> hold;  }
292                         gobble(in);
293                         
294                         //is this a new name?
295                         if (name != prevName) {
296                                 prevName = name;
297                                 nameMap->push_back(name);
298                                 num++;
299                         }
300                 }
301         
302                 in.close();
303                 
304                 //write out names file
305                 //string outNames = getRootName(blastfile) + "names";
306                 //ofstream out;
307                 //openOutputFile(outNames, out);
308                 //nameMap->print(out);
309                 //out.close();
310                 
311                 m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
312                 
313         }
314         catch(exception& e) {
315                 m->errorOut(e, "ReadBlast", "readNames");
316                 exit(1);
317         }
318
319 /*********************************************************************************************/
320
321
322