]> git.donarmstrong.com Git - mothur.git/blob - readblast.cpp
committing so I can work on the other machine
[mothur.git] / readblast.cpp
1 /*
2  *  readblast.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/10/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "readblast.h"
11 #include "progress.hpp"
12
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(seqDist left, seqDist right){
16         return (left.dist < right.dist);        
17
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
20         try {
21                 m = MothurOut::getInstance();
22                 matrix = NULL;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "ReadBlast", "ReadBlast");
26                 exit(1);
27         }
28
29 /*********************************************************************************************/
30 //assumptions about the blast file: 
31 //1. if duplicate lines occur the first line is always best and is chosen
32 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
33 int ReadBlast::read(NameAssignment* nameMap) {
34         try {
35         
36                 //if the user has not given a names file read names from blastfile
37                 if (nameMap->size() == 0) { readNames(nameMap);  }
38                 int nseqs = nameMap->size();
39                 
40                 if (m->control_pressed) { return 0; }
41
42                 ifstream fileHandle;
43                 openInputFile(blastfile, fileHandle);
44                 
45                 string firstName, secondName, eScore, currentRow;
46                 string repeatName = "";
47                 int count = 1;
48                 float distance, thisoverlap, refScore;
49                 float percentId; 
50                 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
51                 
52                 ofstream outDist;
53                 ofstream outOverlap;
54                 
55                 //create objects needed for read
56                 if (!hclusterWanted) {
57                         matrix = new SparseMatrix();
58                 }else{
59                         overlapFile = getRootName(blastfile) + "overlap.dist";
60                         distFile = getRootName(blastfile) + "hclusterDists.dist";
61                         
62                         openOutputFile(overlapFile, outOverlap);
63                         openOutputFile(distFile, outDist);
64                 }
65                 
66                 if (m->control_pressed) { 
67                         fileHandle.close();
68                         if (!hclusterWanted) {  delete matrix; }
69                         else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str());  }
70                         return 0;
71                 }
72                 
73                 Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
74                 
75                 //this is used to quickly find if we already have a distance for this combo
76                 vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
77                 map<int, float> thisRowsBlastScores;
78                 
79                 if (!fileHandle.eof()) {
80                         //read in line from file
81                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
82                         gobble(fileHandle);
83                         
84                         currentRow = firstName;
85                         lengthThisSeq = numBases;
86                         repeatName = firstName + secondName;
87                         
88                         if (firstName == secondName) {   refScore = score;  }
89                         else{
90                                 //convert name to number
91                                 map<string,int>::iterator itA = nameMap->find(firstName);
92                                 map<string,int>::iterator itB = nameMap->find(secondName);
93                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
94                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
95                                 
96                                 thisRowsBlastScores[itB->second] = score;
97                                 
98                                 //calc overlap score
99                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
100                                 
101                                 //if there is a valid overlap, add it
102                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
103                                         if (!hclusterWanted) {
104                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
105                                                 overlap.push_back(overlapValue);
106                                         }else {
107                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
108                                         }
109                                 }
110                         }
111                 }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
112
113                                 
114                 //read file
115                 while(!fileHandle.eof()){  
116                 
117                         if (m->control_pressed) { 
118                                 fileHandle.close();
119                                 if (!hclusterWanted) {  delete matrix; }
120                                 else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str());  }
121                                 delete reading;
122                                 return 0;
123                         }
124                         
125                         //read in line from file
126                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
127                         //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
128                         gobble(fileHandle);
129                         
130                         string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
131                         
132                         //if this is a new pairing
133                         if (temp != repeatName) {
134                                 repeatName = temp; 
135                                 
136                                 if (currentRow == firstName) {
137                                         //cout << "first = " << firstName << " second = " << secondName << endl;        
138                                         if (firstName == secondName) { 
139                                                 refScore = score;  
140                                                 reading->update((count + nseqs));  
141                                                 count++; 
142                                         }else{
143                                                 //convert name to number
144                                                 map<string,int>::iterator itA = nameMap->find(firstName);
145                                                 map<string,int>::iterator itB = nameMap->find(secondName);
146                                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
147                                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
148                                                 
149                                                 //save score
150                                                 thisRowsBlastScores[itB->second] = score;
151                                                         
152                                                 //calc overlap score
153                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
154                                                 
155                                                 //if there is a valid overlap, add it
156                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
157                                                         if (!hclusterWanted) {
158                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
159                                                                 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
160                                                                 overlap.push_back(overlapValue);
161                                                         }else {
162                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
163                                                         }
164                                                 }
165                                         } //end else
166                                 }else { //end row
167                                         //convert blast scores to distance and add cell to sparse matrix if we can
168                                         map<int, float>::iterator it;
169                                         map<int, float>::iterator itDist;
170                                         for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
171                                                 distance = 1.0 - (it->second / refScore);
172                                                 
173                                                 //do we already have the distance calculated for b->a
174                                                 map<string,int>::iterator itA = nameMap->find(currentRow);
175                                                 itDist = dists[it->first].find(itA->second);
176                                                 
177                                                 //if we have it then compare
178                                                 if (itDist != dists[it->first].end()) {
179                                                         //if you want the minimum blast score ratio, then pick max distance
180                                                         if(minWanted) {  distance = max(itDist->second, distance);  }
181                                                         else{   distance = min(itDist->second, distance);  }
182                                                         
183                                                         //is this distance below cutoff
184                                                         if (distance < cutoff) {
185                                                                 if (!hclusterWanted) {
186                                                                         PCell value(itA->second, it->first, distance);
187                                                                         matrix->addCell(value);
188                                                                 }else{
189                                                                         outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
190                                                                 }
191                                                         }
192                                                         //not going to need this again
193                                                         dists[it->first].erase(itDist);
194                                                 }else { //save this value until we get the other ratio
195                                                         dists[itA->second][it->first] = distance;
196                                                 }
197                                         }
198                                         //clear out last rows info
199                                         thisRowsBlastScores.clear();
200                                         
201                                         currentRow = firstName;
202                                         lengthThisSeq = numBases;
203                                         
204                                         //add this row to thisRowsBlastScores
205                                         if (firstName == secondName) {   refScore = score;  }
206                                         else{ //add this row to thisRowsBlastScores
207                                                 
208                                                 //convert name to number
209                                                 map<string,int>::iterator itA = nameMap->find(firstName);
210                                                 map<string,int>::iterator itB = nameMap->find(secondName);
211                                                 if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
212                                                 if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
213                                                 
214                                                 thisRowsBlastScores[itB->second] = score;
215                                                 
216                                                 //calc overlap score
217                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
218                                                 
219                                                 //if there is a valid overlap, add it
220                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
221                                                         if (!hclusterWanted) {
222                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
223                                                                 overlap.push_back(overlapValue);
224                                                         }else {
225                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
226                                                         }
227                                                 }
228                                         }
229                                 }//end if current row
230                         }//end if repeat
231                 }//end while
232                 
233                 //get last rows info stored
234                 //convert blast scores to distance and add cell to sparse matrix if we can
235                 map<int, float>::iterator it;
236                 map<int, float>::iterator itDist;
237                 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
238                         distance = 1.0 - (it->second / refScore);
239                         
240                         //do we already have the distance calculated for b->a
241                         map<string,int>::iterator itA = nameMap->find(currentRow);
242                         itDist = dists[it->first].find(itA->second);
243                         
244                         //if we have it then compare
245                         if (itDist != dists[it->first].end()) {
246                                 //if you want the minimum blast score ratio, then pick max distance
247                                 if(minWanted) {  distance = max(itDist->second, distance);  }
248                                 else{   distance = min(itDist->second, distance);  }
249                                 
250                                 //is this distance below cutoff
251                                 if (distance < cutoff) {
252                                         if (!hclusterWanted) {
253                                                 PCell value(itA->second, it->first, distance);
254                                                 matrix->addCell(value);
255                                         }else{
256                                                 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
257                                         }
258                                 }
259                                 //not going to need this again
260                                 dists[it->first].erase(itDist);
261                         }else { //save this value until we get the other ratio
262                                 dists[itA->second][it->first] = distance;
263                         }
264                 }
265                 //clear out info
266                 thisRowsBlastScores.clear();
267                 dists.clear();
268                 
269                 if (m->control_pressed) { 
270                                 fileHandle.close();
271                                 if (!hclusterWanted) {  delete matrix; }
272                                 else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str());  }
273                                 delete reading;
274                                 return 0;
275                 }
276                 
277                 if (!hclusterWanted) {
278                         sort(overlap.begin(), overlap.end(), compareOverlap);
279                 }else {
280                         outDist.close();
281                         outOverlap.close();
282                 }
283                 
284                 if (m->control_pressed) { 
285                                 fileHandle.close();
286                                 if (!hclusterWanted) {  delete matrix; }
287                                 else {  remove(overlapFile.c_str());  remove(distFile.c_str());  }
288                                 delete reading;
289                                 return 0;
290                 }
291                 
292                 reading->finish();
293                 delete reading;
294                 fileHandle.close();
295                 
296                 return 0;
297         }
298         catch(exception& e) {
299                 m->errorOut(e, "ReadBlast", "read");
300                 exit(1);
301         }
302
303 /*********************************************************************************************/
304 int ReadBlast::readNames(NameAssignment* nameMap) {
305         try {
306                 m->mothurOut("Reading names... "); cout.flush();
307                 
308                 string name, hold, prevName;
309                 int num = 1;
310                 
311                 ifstream in;
312                 openInputFile(blastfile, in);
313                 
314                 ofstream outName;
315                 openOutputFile("tempOutNames", outName);
316                 
317                 //read first line
318                 in >> prevName;
319         
320                 for (int i = 0; i < 11; i++) {  in >> hold;  }
321                 gobble(in);
322                                 
323                 //save name in nameMap
324                 nameMap->push_back(prevName);
325                 
326                 while (!in.eof()) {
327                         if (m->control_pressed) { in.close(); return 0; }
328                         
329                         //read line
330                         in >> name;
331         
332                         for (int i = 0; i < 11; i++) {  in >> hold;  }
333                         gobble(in);
334                         
335                         //is this a new name?
336                         if (name != prevName) {
337                                 prevName = name;
338                                 nameMap->push_back(name);
339                                 num++;
340                         }
341                 }
342         
343                 in.close();
344                 
345                 //write out names file
346                 //string outNames = getRootName(blastfile) + "names";
347                 //ofstream out;
348                 //openOutputFile(outNames, out);
349                 //nameMap->print(out);
350                 //out.close();
351                 if (m->control_pressed) { return 0; }
352                 
353                 m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
354                 
355                 return 0;
356                 
357         }
358         catch(exception& e) {
359                 m->errorOut(e, "ReadBlast", "readNames");
360                 exit(1);
361         }
362
363 /*********************************************************************************************/
364
365
366