]> git.donarmstrong.com Git - mothur.git/blob - readblast.cpp
fixed outputName creator, added debugging to readblast
[mothur.git] / readblast.cpp
1 /*
2  *  readblast.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/10/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "readblast.h"
11 #include "progress.hpp"
12
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(seqDist left, seqDist right){
16         return (left.dist < right.dist);        
17
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
20         try {
21                 m = MothurOut::getInstance();
22                 matrix = NULL;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "ReadBlast", "ReadBlast");
26                 exit(1);
27         }
28
29 /*********************************************************************************************/
30 //assumptions about the blast file: 
31 //1. if duplicate lines occur the first line is always best and is chosen
32 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
33 int ReadBlast::read(NameAssignment* nameMap) {
34         try {
35         
36                 //if the user has not given a names file read names from blastfile
37                 if (nameMap->size() == 0) { readNames(nameMap);  }
38                 int nseqs = nameMap->size();
39                 
40                 if (m->control_pressed) { return 0; }
41
42                 ifstream fileHandle;
43                 m->openInputFile(blastfile, fileHandle);
44                 
45                 string firstName, secondName, eScore, currentRow;
46                 string repeatName = "";
47                 int count = 1;
48                 float distance, thisoverlap, refScore;
49                 float percentId; 
50                 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
51                 
52                 ofstream outDist;
53                 ofstream outOverlap;
54                 
55                 //create objects needed for read
56                 if (!hclusterWanted) {
57                         matrix = new SparseDistanceMatrix();
58             matrix->resize(nseqs);
59                 }else{
60                         overlapFile = m->getRootName(blastfile) + "overlap.dist";
61                         distFile = m->getRootName(blastfile) + "hclusterDists.dist";
62                         
63                         m->openOutputFile(overlapFile, outOverlap);
64                         m->openOutputFile(distFile, outDist);
65                 }
66                 
67                 if (m->control_pressed) { 
68                         fileHandle.close();
69                         if (!hclusterWanted) {  delete matrix; }
70                         else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
71                         return 0;
72                 }
73                 
74                 Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
75                 
76                 //this is used to quickly find if we already have a distance for this combo
77                 vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
78                 map<int, float> thisRowsBlastScores;
79                 
80                 if (!fileHandle.eof()) {
81                         //read in line from file
82                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
83                         m->gobble(fileHandle);
84                         
85                         currentRow = firstName;
86                         lengthThisSeq = numBases;
87                         repeatName = firstName + secondName;
88                         
89                         if (firstName == secondName) {   refScore = score;  }
90                         else{
91                                 //convert name to number
92                                 map<string,int>::iterator itA = nameMap->find(firstName);
93                                 map<string,int>::iterator itB = nameMap->find(secondName);
94                                 if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
95                                 if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
96                                 
97                                 thisRowsBlastScores[itB->second] = score;
98                                 
99                                 //calc overlap score
100                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
101                                 
102                                 //if there is a valid overlap, add it
103                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
104                                         if (!hclusterWanted) {
105                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
106                                                 overlap.push_back(overlapValue);
107                                         }else {
108                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
109                                         }
110                                 }
111                         }
112                 }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
113 string outDistFilem = "../kathryn/blastDist.dist";
114         ofstream outMDist;
115         m->openOutputFile(outDistFilem, outMDist);
116                 //read file
117                 while(!fileHandle.eof()){  
118                 
119                         if (m->control_pressed) { 
120                                 fileHandle.close();
121                                 if (!hclusterWanted) {  delete matrix; }
122                                 else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
123                                 delete reading;
124                                 return 0;
125                         }
126                         
127                         //read in line from file
128                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
129                         //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
130                         m->gobble(fileHandle);
131                         
132                         string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
133                         
134                         //if this is a new pairing
135                         if (temp != repeatName) {
136                                 repeatName = temp; 
137                                 
138                                 if (currentRow == firstName) {
139                                         //cout << "first = " << firstName << " second = " << secondName << endl;        
140                                         if (firstName == secondName) { 
141                                                 refScore = score;  
142                                                 reading->update((count + nseqs));  
143                                                 count++; 
144                                         }else{
145                                                 //convert name to number
146                                                 map<string,int>::iterator itA = nameMap->find(firstName);
147                                                 map<string,int>::iterator itB = nameMap->find(secondName);
148                                                 if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
149                                                 if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
150                                                 
151                                                 //save score
152                                                 thisRowsBlastScores[itB->second] = score;
153                                                         
154                                                 //calc overlap score
155                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
156                                                 
157                                                 //if there is a valid overlap, add it
158                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
159                                                         if (!hclusterWanted) {
160                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
161                                                                 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
162                                                                 overlap.push_back(overlapValue);
163                                                         }else {
164                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
165                                                         }
166                                                 }
167                                         } //end else
168                                 }else { //end row
169                                         //convert blast scores to distance and add cell to sparse matrix if we can
170                                         map<int, float>::iterator it;
171                                         map<int, float>::iterator itDist;
172                                         for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
173                                                 distance = 1.0 - (it->second / refScore);
174                 
175                                                 
176                                                 //do we already have the distance calculated for b->a
177                                                 map<string,int>::iterator itA = nameMap->find(currentRow);
178                                                 itDist = dists[it->first].find(itA->second);
179                                                 
180                                                 //if we have it then compare
181                                                 if (itDist != dists[it->first].end()) {
182         
183                                                         //if you want the minimum blast score ratio, then pick max distance
184                                                         if(minWanted) {  distance = max(itDist->second, distance);  }
185                                                         else{   distance = min(itDist->second, distance);  }
186
187                                                         //is this distance below cutoff
188                                                         if (distance < cutoff) {
189                                                                 if (!hclusterWanted) {
190                                     if (itA->second < it->first) {
191                                         PDistCell value(it->first, distance);
192                                         matrix->addCell(itA->second, value);
193                                     }else {
194                                         PDistCell value(itA->second, distance);
195                                         matrix->addCell(it->first, value);
196                                     }
197                                     outMDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
198                                                                 }else{
199                                                                         outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
200                                                                 }
201                                                         }
202                                                         //not going to need this again
203                                                         dists[it->first].erase(itDist);
204                                                 }else { //save this value until we get the other ratio
205                                                         dists[itA->second][it->first] = distance;
206                                                 }
207                                         }
208                                         //clear out last rows info
209                                         thisRowsBlastScores.clear();
210                                         
211                                         currentRow = firstName;
212                                         lengthThisSeq = numBases;
213                                         
214                                         //add this row to thisRowsBlastScores
215                                         if (firstName == secondName) {   refScore = score;  }
216                                         else{ //add this row to thisRowsBlastScores
217                                                 
218                                                 //convert name to number
219                                                 map<string,int>::iterator itA = nameMap->find(firstName);
220                                                 map<string,int>::iterator itB = nameMap->find(secondName);
221                                                 if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
222                                                 if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
223                                                 
224                                                 thisRowsBlastScores[itB->second] = score;
225                                                 
226                                                 //calc overlap score
227                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
228                                                 
229                                                 //if there is a valid overlap, add it
230                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
231                                                         if (!hclusterWanted) {
232                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
233                                                                 overlap.push_back(overlapValue);
234                                                         }else {
235                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
236                                                         }
237                                                 }
238                                         }
239                                 }//end if current row
240                         }//end if repeat
241                 }//end while
242                 
243                 //get last rows info stored
244                 //convert blast scores to distance and add cell to sparse matrix if we can
245                 map<int, float>::iterator it;
246                 map<int, float>::iterator itDist;
247                 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
248                         distance = 1.0 - (it->second / refScore);
249                         
250                         //do we already have the distance calculated for b->a
251                         map<string,int>::iterator itA = nameMap->find(currentRow);
252                         itDist = dists[it->first].find(itA->second);
253                         
254                         //if we have it then compare
255                         if (itDist != dists[it->first].end()) {
256                                 //if you want the minimum blast score ratio, then pick max distance
257                                 if(minWanted) {  distance = max(itDist->second, distance);  }
258                                 else{   distance = min(itDist->second, distance);  }
259                                 
260                                 //is this distance below cutoff
261                                 if (distance < cutoff) {
262                                         if (!hclusterWanted) {
263                                                 if (itA->second < it->first) {
264                             PDistCell value(it->first, distance);
265                             matrix->addCell(itA->second, value);
266                         }else {
267                             PDistCell value(itA->second, distance);
268                             matrix->addCell(it->first, value);
269                         }
270                                         }else{
271                                                 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
272                                         }
273                                 }
274                                 //not going to need this again
275                                 dists[it->first].erase(itDist);
276                         }else { //save this value until we get the other ratio
277                                 dists[itA->second][it->first] = distance;
278                         }
279                 }
280                 //clear out info
281                 thisRowsBlastScores.clear();
282                 dists.clear();
283                 
284                 if (m->control_pressed) { 
285                                 fileHandle.close();
286                                 if (!hclusterWanted) {  delete matrix; }
287                                 else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
288                                 delete reading;
289                                 return 0;
290                 }
291                 
292                 if (!hclusterWanted) {
293                         sort(overlap.begin(), overlap.end(), compareOverlap);
294                 }else {
295                         outDist.close();
296                         outOverlap.close();
297                 }
298                 
299                 if (m->control_pressed) { 
300                                 fileHandle.close();
301                                 if (!hclusterWanted) {  delete matrix; }
302                                 else {  m->mothurRemove(overlapFile);  m->mothurRemove(distFile);  }
303                                 delete reading;
304                                 return 0;
305                 }
306                 
307                 reading->finish();
308                 delete reading;
309                 fileHandle.close();
310                 
311                 return 0;
312         }
313         catch(exception& e) {
314                 m->errorOut(e, "ReadBlast", "read");
315                 exit(1);
316         }
317
318 /*********************************************************************************************/
319 int ReadBlast::readNames(NameAssignment* nameMap) {
320         try {
321                 m->mothurOut("Reading names... "); cout.flush();
322                 
323                 string name, hold, prevName;
324                 int num = 1;
325                 
326                 ifstream in;
327                 m->openInputFile(blastfile, in);
328                 
329                 ofstream outName;
330                 m->openOutputFile((blastfile + ".tempOutNames"), outName);
331                 
332                 //read first line
333                 in >> prevName;
334         
335                 for (int i = 0; i < 11; i++) {  in >> hold;  }
336                 m->gobble(in);
337                                 
338                 //save name in nameMap
339                 nameMap->push_back(prevName);
340                 
341                 while (!in.eof()) {
342                         if (m->control_pressed) { in.close(); return 0; }
343                         
344                         //read line
345                         in >> name;
346         
347                         for (int i = 0; i < 11; i++) {  in >> hold;  }
348                         m->gobble(in);
349                         
350                         //is this a new name?
351                         if (name != prevName) {
352                                 prevName = name;
353                                 nameMap->push_back(name);
354                 outName << name << '\t' << name << endl;
355                                 num++;
356                         }
357                 }
358         
359                 in.close();
360                 
361                 //write out names file
362                 //string outNames = m->getRootName(blastfile) + "names";
363                 //ofstream out;
364                 //m->openOutputFile(outNames, out);
365                 //nameMap->print(out);
366                 //out.close();
367                 
368                 if (m->control_pressed) { return 0; }
369                 
370                 m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
371                 
372                 return 0;
373                 
374         }
375         catch(exception& e) {
376                 m->errorOut(e, "ReadBlast", "readNames");
377                 exit(1);
378         }
379
380 /*********************************************************************************************/
381
382
383