]> git.donarmstrong.com Git - mothur.git/blob - readblast.cpp
changing command name classify.shared to classifyrf.shared
[mothur.git] / readblast.cpp
1 /*
2  *  readblast.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/10/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "readblast.h"
11 #include "progress.hpp"
12
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(seqDist left, seqDist right){
16         return (left.dist < right.dist);        
17
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
20         try {
21                 m = MothurOut::getInstance();
22                 matrix = NULL;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "ReadBlast", "ReadBlast");
26                 exit(1);
27         }
28
29 /*********************************************************************************************/
30 //assumptions about the blast file: 
31 //1. if duplicate lines occur the first line is always best and is chosen
32 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
33 int ReadBlast::read(NameAssignment* nameMap) {
34         try {
35         
36                 //if the user has not given a names file read names from blastfile
37                 if (nameMap->size() == 0) { readNames(nameMap);  }
38                 int nseqs = nameMap->size();
39                 
40                 if (m->control_pressed) { return 0; }
41
42                 ifstream fileHandle;
43                 m->openInputFile(blastfile, fileHandle);
44                 
45                 string firstName, secondName, eScore, currentRow;
46                 string repeatName = "";
47                 int count = 1;
48                 float distance, thisoverlap, refScore;
49                 float percentId; 
50                 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
51                 
52                 ofstream outDist;
53                 ofstream outOverlap;
54                 
55                 //create objects needed for read
56                 if (!hclusterWanted) {
57                         matrix = new SparseDistanceMatrix();
58             matrix->resize(nseqs);
59                 }else{
60                         overlapFile = m->getRootName(blastfile) + "overlap.dist";
61                         distFile = m->getRootName(blastfile) + "hclusterDists.dist";
62                         
63                         m->openOutputFile(overlapFile, outOverlap);
64                         m->openOutputFile(distFile, outDist);
65                 }
66                 
67                 if (m->control_pressed) { 
68                         fileHandle.close();
69                         if (!hclusterWanted) {  delete matrix; }
70                         else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
71                         return 0;
72                 }
73                 
74                 Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
75                 
76                 //this is used to quickly find if we already have a distance for this combo
77                 vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
78                 map<int, float> thisRowsBlastScores;
79                 
80                 if (!fileHandle.eof()) {
81                         //read in line from file
82                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
83                         m->gobble(fileHandle);
84                         
85                         currentRow = firstName;
86                         lengthThisSeq = numBases;
87                         repeatName = firstName + secondName;
88                         
89                         if (firstName == secondName) {   refScore = score;  }
90                         else{
91                                 //convert name to number
92                                 map<string,int>::iterator itA = nameMap->find(firstName);
93                                 map<string,int>::iterator itB = nameMap->find(secondName);
94                                 if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
95                                 if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
96                                 
97                                 thisRowsBlastScores[itB->second] = score;
98                                 
99                                 //calc overlap score
100                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
101                                 
102                                 //if there is a valid overlap, add it
103                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
104                                         if (!hclusterWanted) {
105                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
106                                                 overlap.push_back(overlapValue);
107                                         }else {
108                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
109                                         }
110                                 }
111                         }
112                 }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
113
114        
115                 //read file
116                 while(!fileHandle.eof()){  
117                 
118                         if (m->control_pressed) { 
119                                 fileHandle.close();
120                                 if (!hclusterWanted) {  delete matrix; }
121                                 else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
122                                 delete reading;
123                                 return 0;
124                         }
125                         
126                         //read in line from file
127                         fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
128                         //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
129                         m->gobble(fileHandle);
130                         
131                         string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
132                         
133                         //if this is a new pairing
134                         if (temp != repeatName) {
135                                 repeatName = temp; 
136                                 
137                                 if (currentRow == firstName) {
138                                         //cout << "first = " << firstName << " second = " << secondName << endl;        
139                                         if (firstName == secondName) { 
140                                                 refScore = score;  
141                                                 reading->update((count + nseqs));  
142                                                 count++; 
143                                         }else{
144                                                 //convert name to number
145                                                 map<string,int>::iterator itA = nameMap->find(firstName);
146                                                 map<string,int>::iterator itB = nameMap->find(secondName);
147                                                 if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
148                                                 if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
149                                                 
150                                                 //save score
151                                                 thisRowsBlastScores[itB->second] = score;
152                                                         
153                                                 //calc overlap score
154                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
155                                                 
156                                                 //if there is a valid overlap, add it
157                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
158                                                         if (!hclusterWanted) {
159                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
160                                                                 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
161                                                                 overlap.push_back(overlapValue);
162                                                         }else {
163                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
164                                                         }
165                                                 }
166                                         } //end else
167                                 }else { //end row
168                                         //convert blast scores to distance and add cell to sparse matrix if we can
169                                         map<int, float>::iterator it;
170                                         map<int, float>::iterator itDist;
171                                         for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
172                                                 distance = 1.0 - (it->second / refScore);
173                 
174                                                 
175                                                 //do we already have the distance calculated for b->a
176                                                 map<string,int>::iterator itA = nameMap->find(currentRow);
177                                                 itDist = dists[it->first].find(itA->second);
178                                                 
179                                                 //if we have it then compare
180                                                 if (itDist != dists[it->first].end()) {
181         
182                                                         //if you want the minimum blast score ratio, then pick max distance
183                                                         if(minWanted) {  distance = max(itDist->second, distance);  }
184                                                         else{   distance = min(itDist->second, distance);  }
185
186                                                         //is this distance below cutoff
187                                                         if (distance < cutoff) {
188                                                                 if (!hclusterWanted) {
189                                     if (itA->second < it->first) {
190                                         PDistCell value(it->first, distance);
191                                         matrix->addCell(itA->second, value);
192                                     }else {
193                                         PDistCell value(itA->second, distance);
194                                         matrix->addCell(it->first, value);
195                                     }
196                                 }else{
197                                                                         outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
198                                                                 }
199                                                         }
200                                                         //not going to need this again
201                                                         dists[it->first].erase(itDist);
202                                                 }else { //save this value until we get the other ratio
203                                                         dists[itA->second][it->first] = distance;
204                                                 }
205                                         }
206                                         //clear out last rows info
207                                         thisRowsBlastScores.clear();
208                                         
209                                         currentRow = firstName;
210                                         lengthThisSeq = numBases;
211                                         
212                                         //add this row to thisRowsBlastScores
213                                         if (firstName == secondName) {   refScore = score;  }
214                                         else{ //add this row to thisRowsBlastScores
215                                                 
216                                                 //convert name to number
217                                                 map<string,int>::iterator itA = nameMap->find(firstName);
218                                                 map<string,int>::iterator itB = nameMap->find(secondName);
219                                                 if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
220                                                 if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
221                                                 
222                                                 thisRowsBlastScores[itB->second] = score;
223                                                 
224                                                 //calc overlap score
225                                                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
226                                                 
227                                                 //if there is a valid overlap, add it
228                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
229                                                         if (!hclusterWanted) {
230                                                                 seqDist overlapValue(itA->second, itB->second, thisoverlap);
231                                                                 overlap.push_back(overlapValue);
232                                                         }else {
233                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
234                                                         }
235                                                 }
236                                         }
237                                 }//end if current row
238                         }//end if repeat
239                 }//end while
240                 
241                 //get last rows info stored
242                 //convert blast scores to distance and add cell to sparse matrix if we can
243                 map<int, float>::iterator it;
244                 map<int, float>::iterator itDist;
245                 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
246                         distance = 1.0 - (it->second / refScore);
247                         
248                         //do we already have the distance calculated for b->a
249                         map<string,int>::iterator itA = nameMap->find(currentRow);
250                         itDist = dists[it->first].find(itA->second);
251                         
252                         //if we have it then compare
253                         if (itDist != dists[it->first].end()) {
254                                 //if you want the minimum blast score ratio, then pick max distance
255                                 if(minWanted) {  distance = max(itDist->second, distance);  }
256                                 else{   distance = min(itDist->second, distance);  }
257                                 
258                                 //is this distance below cutoff
259                                 if (distance < cutoff) {
260                                         if (!hclusterWanted) {
261                                                 if (itA->second < it->first) {
262                             PDistCell value(it->first, distance);
263                             matrix->addCell(itA->second, value);
264                         }else {
265                             PDistCell value(itA->second, distance);
266                             matrix->addCell(it->first, value);
267                         }
268                                         }else{
269                                                 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
270                                         }
271                                 }
272                                 //not going to need this again
273                                 dists[it->first].erase(itDist);
274                         }else { //save this value until we get the other ratio
275                                 dists[itA->second][it->first] = distance;
276                         }
277                 }
278                 //clear out info
279                 thisRowsBlastScores.clear();
280                 dists.clear();
281                 
282                 if (m->control_pressed) { 
283                                 fileHandle.close();
284                                 if (!hclusterWanted) {  delete matrix; }
285                                 else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
286                                 delete reading;
287                                 return 0;
288                 }
289                 
290                 if (!hclusterWanted) {
291                         sort(overlap.begin(), overlap.end(), compareOverlap);
292                 }else {
293                         outDist.close();
294                         outOverlap.close();
295                 }
296                 
297                 if (m->control_pressed) { 
298                                 fileHandle.close();
299                                 if (!hclusterWanted) {  delete matrix; }
300                                 else {  m->mothurRemove(overlapFile);  m->mothurRemove(distFile);  }
301                                 delete reading;
302                                 return 0;
303                 }
304                 
305                 reading->finish();
306                 delete reading;
307                 fileHandle.close();
308                 
309                 return 0;
310         }
311         catch(exception& e) {
312                 m->errorOut(e, "ReadBlast", "read");
313                 exit(1);
314         }
315
316 /*********************************************************************************************/
317 int ReadBlast::readNames(NameAssignment* nameMap) {
318         try {
319                 m->mothurOut("Reading names... "); cout.flush();
320                 
321                 string name, hold, prevName;
322                 int num = 1;
323                 
324                 ifstream in;
325                 m->openInputFile(blastfile, in);
326                 
327                 //ofstream outName;
328                 //m->openOutputFile((blastfile + ".tempOutNames"), outName);
329                 
330                 //read first line
331                 in >> prevName;
332         
333                 for (int i = 0; i < 11; i++) {  in >> hold;  }
334                 m->gobble(in);
335                                 
336                 //save name in nameMap
337                 nameMap->push_back(prevName);
338                 
339                 while (!in.eof()) {
340                         if (m->control_pressed) { in.close(); return 0; }
341                         
342                         //read line
343                         in >> name;
344         
345                         for (int i = 0; i < 11; i++) {  in >> hold;  }
346                         m->gobble(in);
347                         
348                         //is this a new name?
349                         if (name != prevName) {
350                                 prevName = name;
351                 
352                 if (nameMap->get(name) != -1) { m->mothurOut("[ERROR]: trying to exact names from blast file, and I found dups.  Are you sequence names unique? quitting.\n"); m->control_pressed = true; }
353                 else {
354                     nameMap->push_back(name);
355                 }
356                 //outName << name << '\t' << name << endl;
357                                 num++;
358                         }
359                 }
360         
361                 in.close();
362                 
363                 //write out names file
364                 //string outNames = m->getRootName(blastfile) + "names";
365                 //ofstream out;
366                 //m->openOutputFile(outNames, out);
367                 //nameMap->print(out);
368                 //out.close();
369                 
370                 if (m->control_pressed) { return 0; }
371                 
372                 m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
373                 
374                 return 0;
375                 
376         }
377         catch(exception& e) {
378                 m->errorOut(e, "ReadBlast", "readNames");
379                 exit(1);
380         }
381
382 /*********************************************************************************************/
383
384
385