]> git.donarmstrong.com Git - mothur.git/blob - readblast.cpp
moved utilities out of mothur.h and into mothurOut class.
[mothur.git] / readblast.cpp
1 /*
2  *  readblast.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/10/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "readblast.h"
11 #include "progress.hpp"
12
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(seqDist left, seqDist right){
16         return (left.dist < right.dist);        
17
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
20         try {
21                 m = MothurOut::getInstance();
22                 matrix = NULL;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "ReadBlast", "ReadBlast");
26                 exit(1);
27         }
28
29 /*********************************************************************************************/
30 //assumptions about the blast file: 
31 //1. if duplicate lines occur the first line is always best and is chosen
32 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
33 int ReadBlast::read(NameAssignment* nameMap) {
34         try {
35         
36                 //if the user has not given a names file read names from blastfile
37                 if (nameMap->size() == 0) { readNames(nameMap);  }
38                 int nseqs = nameMap->size();
39                 
40                 if (m->control_pressed) { return 0; }
41
42                 ifstream fileHandle;
43                 m->openInputFile(blastfile, fileHandle);
44                 
45                 string firstName, secondName, eScore, currentRow;
46                 string repeatName = "";
47                 int count = 1;
48                 float distance, thisoverlap, refScore;
49                 float percentId; 
50                 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
51                 
52                 ofstream outDist;
53                 ofstream outOverlap;
54                 
55                 //create objects needed for read
56                 if (!hclusterWanted) {
57                         matrix = new SparseMatrix();
58                 }else{
59                         overlapFile = m->getRootName(blastfile) + "overlap.dist";
60                         distFile = m->getRootName(blastfile) + "hclusterDists.dist";
61                         
62                         m->openOutputFile(overlapFile, outOverlap);
63                         m->openOutputFile(distFile, outDist);
64                 }
65                 
66                 if (m->control_pressed) { 
67                         fileHandle.close();
68                         if (!hclusterWanted) {  delete matrix; }
69                         else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str());  }
70                         return 0;
71                 }
72                 
73                 Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
74                 
75                 //this is used to quickly find if we already have a distance for this combo
76                 vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
77                 map<int, float> thisRowsBlastScores;
78                 
79                 if (!fileHandle.eof()) {
80                         //read in line from file
81                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
82                         m->gobble(fileHandle);
83                         
84                         currentRow = firstName;
85                         lengthThisSeq = numBases;
86                         repeatName = firstName + secondName;
87                         
88                         if (firstName == secondName) {   refScore = score;  }
89                         else{
90                                 //convert name to number
91                                 map<string,int>::iterator itA = nameMap->find(firstName);
92                                 map<string,int>::iterator itB = nameMap->find(secondName);
93                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
94                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
95                                 
96                                 thisRowsBlastScores[itB->second] = score;
97                                 
98                                 //calc overlap score
99                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
100                                 
101                                 //if there is a valid overlap, add it
102                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
103                                         if (!hclusterWanted) {
104                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
105                                                 overlap.push_back(overlapValue);
106                                         }else {
107                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
108                                         }
109                                 }
110                         }
111                 }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
112
113                                 
114                 //read file
115                 while(!fileHandle.eof()){  
116                 
117                         if (m->control_pressed) { 
118                                 fileHandle.close();
119                                 if (!hclusterWanted) {  delete matrix; }
120                                 else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str());  }
121                                 delete reading;
122                                 return 0;
123                         }
124                         
125                         //read in line from file
126                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
127                         //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
128                         m->gobble(fileHandle);
129                         
130                         string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
131                         
132                         //if this is a new pairing
133                         if (temp != repeatName) {
134                                 repeatName = temp; 
135                                 
136                                 if (currentRow == firstName) {
137                                         //cout << "first = " << firstName << " second = " << secondName << endl;        
138                                         if (firstName == secondName) { 
139                                                 refScore = score;  
140                                                 reading->update((count + nseqs));  
141                                                 count++; 
142                                         }else{
143                                                 //convert name to number
144                                                 map<string,int>::iterator itA = nameMap->find(firstName);
145                                                 map<string,int>::iterator itB = nameMap->find(secondName);
146                                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
147                                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
148                                                 
149                                                 //save score
150                                                 thisRowsBlastScores[itB->second] = score;
151                                                         
152                                                 //calc overlap score
153                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
154                                                 
155                                                 //if there is a valid overlap, add it
156                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
157                                                         if (!hclusterWanted) {
158                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
159                                                                 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
160                                                                 overlap.push_back(overlapValue);
161                                                         }else {
162                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
163                                                         }
164                                                 }
165                                         } //end else
166                                 }else { //end row
167                                         //convert blast scores to distance and add cell to sparse matrix if we can
168                                         map<int, float>::iterator it;
169                                         map<int, float>::iterator itDist;
170                                         for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
171                                                 distance = 1.0 - (it->second / refScore);
172                 
173                                                 
174                                                 //do we already have the distance calculated for b->a
175                                                 map<string,int>::iterator itA = nameMap->find(currentRow);
176                                                 itDist = dists[it->first].find(itA->second);
177                                                 
178                                                 //if we have it then compare
179                                                 if (itDist != dists[it->first].end()) {
180         
181                                                         //if you want the minimum blast score ratio, then pick max distance
182                                                         if(minWanted) {  distance = max(itDist->second, distance);  }
183                                                         else{   distance = min(itDist->second, distance);  }
184
185                                                         //is this distance below cutoff
186                                                         if (distance < cutoff) {
187                                                                 if (!hclusterWanted) {
188                                                                         PCell value(itA->second, it->first, distance);
189                                                                         matrix->addCell(value);
190                                                                 }else{
191                                                                         outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
192                                                                 }
193                                                         }
194                                                         //not going to need this again
195                                                         dists[it->first].erase(itDist);
196                                                 }else { //save this value until we get the other ratio
197                                                         dists[itA->second][it->first] = distance;
198                                                 }
199                                         }
200                                         //clear out last rows info
201                                         thisRowsBlastScores.clear();
202                                         
203                                         currentRow = firstName;
204                                         lengthThisSeq = numBases;
205                                         
206                                         //add this row to thisRowsBlastScores
207                                         if (firstName == secondName) {   refScore = score;  }
208                                         else{ //add this row to thisRowsBlastScores
209                                                 
210                                                 //convert name to number
211                                                 map<string,int>::iterator itA = nameMap->find(firstName);
212                                                 map<string,int>::iterator itB = nameMap->find(secondName);
213                                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
214                                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
215                                                 
216                                                 thisRowsBlastScores[itB->second] = score;
217                                                 
218                                                 //calc overlap score
219                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
220                                                 
221                                                 //if there is a valid overlap, add it
222                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
223                                                         if (!hclusterWanted) {
224                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
225                                                                 overlap.push_back(overlapValue);
226                                                         }else {
227                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
228                                                         }
229                                                 }
230                                         }
231                                 }//end if current row
232                         }//end if repeat
233                 }//end while
234                 
235                 //get last rows info stored
236                 //convert blast scores to distance and add cell to sparse matrix if we can
237                 map<int, float>::iterator it;
238                 map<int, float>::iterator itDist;
239                 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
240                         distance = 1.0 - (it->second / refScore);
241                         
242                         //do we already have the distance calculated for b->a
243                         map<string,int>::iterator itA = nameMap->find(currentRow);
244                         itDist = dists[it->first].find(itA->second);
245                         
246                         //if we have it then compare
247                         if (itDist != dists[it->first].end()) {
248                                 //if you want the minimum blast score ratio, then pick max distance
249                                 if(minWanted) {  distance = max(itDist->second, distance);  }
250                                 else{   distance = min(itDist->second, distance);  }
251                                 
252                                 //is this distance below cutoff
253                                 if (distance < cutoff) {
254                                         if (!hclusterWanted) {
255                                                 PCell value(itA->second, it->first, distance);
256                                                 matrix->addCell(value);
257                                         }else{
258                                                 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
259                                         }
260                                 }
261                                 //not going to need this again
262                                 dists[it->first].erase(itDist);
263                         }else { //save this value until we get the other ratio
264                                 dists[itA->second][it->first] = distance;
265                         }
266                 }
267                 //clear out info
268                 thisRowsBlastScores.clear();
269                 dists.clear();
270                 
271                 if (m->control_pressed) { 
272                                 fileHandle.close();
273                                 if (!hclusterWanted) {  delete matrix; }
274                                 else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str());  }
275                                 delete reading;
276                                 return 0;
277                 }
278                 
279                 if (!hclusterWanted) {
280                         sort(overlap.begin(), overlap.end(), compareOverlap);
281                 }else {
282                         outDist.close();
283                         outOverlap.close();
284                 }
285                 
286                 if (m->control_pressed) { 
287                                 fileHandle.close();
288                                 if (!hclusterWanted) {  delete matrix; }
289                                 else {  remove(overlapFile.c_str());  remove(distFile.c_str());  }
290                                 delete reading;
291                                 return 0;
292                 }
293                 
294                 reading->finish();
295                 delete reading;
296                 fileHandle.close();
297                 
298                 return 0;
299         }
300         catch(exception& e) {
301                 m->errorOut(e, "ReadBlast", "read");
302                 exit(1);
303         }
304
305 /*********************************************************************************************/
306 int ReadBlast::readNames(NameAssignment* nameMap) {
307         try {
308                 m->mothurOut("Reading names... "); cout.flush();
309                 
310                 string name, hold, prevName;
311                 int num = 1;
312                 
313                 ifstream in;
314                 m->openInputFile(blastfile, in);
315                 
316                 //ofstream outName;
317                 //m->openOutputFile((blastfile + ".tempOutNames"), outName);
318                 
319                 //read first line
320                 in >> prevName;
321         
322                 for (int i = 0; i < 11; i++) {  in >> hold;  }
323                 m->gobble(in);
324                                 
325                 //save name in nameMap
326                 nameMap->push_back(prevName);
327                 
328                 while (!in.eof()) {
329                         if (m->control_pressed) { in.close(); return 0; }
330                         
331                         //read line
332                         in >> name;
333         
334                         for (int i = 0; i < 11; i++) {  in >> hold;  }
335                         m->gobble(in);
336                         
337                         //is this a new name?
338                         if (name != prevName) {
339                                 prevName = name;
340                                 nameMap->push_back(name);
341                                 num++;
342                         }
343                 }
344         
345                 in.close();
346                 
347                 //write out names file
348                 //string outNames = m->getRootName(blastfile) + "names";
349                 //ofstream out;
350                 //m->openOutputFile(outNames, out);
351                 //nameMap->print(out);
352                 //out.close();
353                 
354                 if (m->control_pressed) { return 0; }
355                 
356                 m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
357                 
358                 return 0;
359                 
360         }
361         catch(exception& e) {
362                 m->errorOut(e, "ReadBlast", "readNames");
363                 exit(1);
364         }
365
366 /*********************************************************************************************/
367
368
369