5 * Created by westcott on 12/10/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "readblast.h"
11 #include "progress.hpp"
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(DistNode left, DistNode right){
16 return (left.dist < right.dist);
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool m, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(m), hclusterWanted(h) {
24 errorOut(e, "ReadBlast", "ReadBlast");
28 /*********************************************************************************************/
29 //assumptions about the blast file:
30 //1. if duplicate lines occur the first line is always best and is chosen
31 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
32 void ReadBlast::read(NameAssignment* nameMap) {
35 //if the user has not given a names file read names from blastfile
36 if (nameMap->size() == 0) { readNames(nameMap); }
37 int nseqs = nameMap->size();
40 openInputFile(blastfile, fileHandle);
42 string firstName, secondName, eScore, currentRow;
43 string repeatName = "";
45 float distance, thisoverlap, refScore;
47 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
52 //create objects needed for read
53 if (!hclusterWanted) {
54 matrix = new SparseMatrix();
56 overlapFile = getRootName(blastfile) + "overlap.dist";
57 distFile = getRootName(blastfile) + "hclusterDists.dist";
59 openOutputFile(overlapFile, outOverlap);
60 openOutputFile(distFile, outDist);
63 Progress* reading = new Progress("Reading blast: ", nseqs * nseqs);
65 //this is used to quickly find if we already have a distance for this combo
66 vector< map<int,float> > dists; dists.resize(nseqs); //dists[0][1] = distance from seq0 to seq1
67 map<int, float> thisRowsBlastScores;
69 if (!fileHandle.eof()) {
70 //read in line from file
71 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
74 currentRow = firstName;
75 lengthThisSeq = numBases;
76 repeatName = firstName + secondName;
78 if (firstName == secondName) { refScore = score; }
80 //convert name to number
81 map<string,int>::iterator itA = nameMap->find(firstName);
82 map<string,int>::iterator itB = nameMap->find(secondName);
83 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
84 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
86 thisRowsBlastScores[itB->second] = score;
89 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
91 //if there is a valid overlap, add it
92 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
93 if (!hclusterWanted) {
94 DistNode overlapValue(itA->second, itB->second, thisoverlap);
95 overlap.push_back(overlapValue);
97 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
101 }else { mothurOut("Error in your blast file, cannot read."); mothurOutEndLine(); exit(1); }
105 while(!fileHandle.eof()){
107 //read in line from file
108 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
109 //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
112 string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
114 //if this is a new pairing
115 if (temp != repeatName) {
118 if (currentRow == firstName) {
119 //cout << "first = " << firstName << " second = " << secondName << endl;
120 if (firstName == secondName) {
122 reading->update((count + nseqs));
125 //convert name to number
126 map<string,int>::iterator itA = nameMap->find(firstName);
127 map<string,int>::iterator itB = nameMap->find(secondName);
128 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
129 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
132 thisRowsBlastScores[itB->second] = score;
135 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
137 //if there is a valid overlap, add it
138 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
139 if (!hclusterWanted) {
140 DistNode overlapValue(itA->second, itB->second, thisoverlap);
141 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
142 overlap.push_back(overlapValue);
144 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
149 //convert blast scores to distance and add cell to sparse matrix if we can
150 map<int, float>::iterator it;
151 map<int, float>::iterator itDist;
152 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
153 distance = 1.0 - (it->second / refScore);
155 //do we already have the distance calculated for b->a
156 map<string,int>::iterator itA = nameMap->find(currentRow);
157 itDist = dists[it->first].find(itA->second);
159 //if we have it then compare
160 if (itDist != dists[it->first].end()) {
161 //if you want the minimum blast score ratio, then pick max distance
162 if(minWanted) { distance = max(itDist->second, distance); }
163 else{ distance = min(itDist->second, distance); }
165 //is this distance below cutoff
166 if (distance < cutoff) {
167 if (!hclusterWanted) {
168 PCell value(itA->second, it->first, distance);
169 matrix->addCell(value);
171 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
174 //not going to need this again
175 dists[it->first].erase(itDist);
176 }else { //save this value until we get the other ratio
177 dists[itA->second][it->first] = distance;
180 //clear out last rows info
181 thisRowsBlastScores.clear();
183 currentRow = firstName;
184 lengthThisSeq = numBases;
186 //add this row to thisRowsBlastScores
187 if (firstName == secondName) { refScore = score; }
188 else{ //add this row to thisRowsBlastScores
190 //convert name to number
191 map<string,int>::iterator itA = nameMap->find(firstName);
192 map<string,int>::iterator itB = nameMap->find(secondName);
193 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
194 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
196 thisRowsBlastScores[itB->second] = score;
199 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
201 //if there is a valid overlap, add it
202 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
203 if (!hclusterWanted) {
204 DistNode overlapValue(itA->second, itB->second, thisoverlap);
205 overlap.push_back(overlapValue);
207 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
211 }//end if current row
215 //get last rows info stored
216 //convert blast scores to distance and add cell to sparse matrix if we can
217 map<int, float>::iterator it;
218 map<int, float>::iterator itDist;
219 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
220 distance = 1.0 - (it->second / refScore);
222 //do we already have the distance calculated for b->a
223 map<string,int>::iterator itA = nameMap->find(currentRow);
224 itDist = dists[it->first].find(itA->second);
226 //if we have it then compare
227 if (itDist != dists[it->first].end()) {
228 //if you want the minimum blast score ratio, then pick max distance
229 if(minWanted) { distance = max(itDist->second, distance); }
230 else{ distance = min(itDist->second, distance); }
232 //is this distance below cutoff
233 if (distance < cutoff) {
234 if (!hclusterWanted) {
235 PCell value(itA->second, it->first, distance);
236 matrix->addCell(value);
238 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
241 //not going to need this again
242 dists[it->first].erase(itDist);
243 }else { //save this value until we get the other ratio
244 dists[itA->second][it->first] = distance;
248 thisRowsBlastScores.clear();
251 if (!hclusterWanted) {
252 sort(overlap.begin(), overlap.end(), compareOverlap);
262 catch(exception& e) {
263 errorOut(e, "ReadBlast", "read");
267 /*********************************************************************************************/
268 void ReadBlast::readNames(NameAssignment* nameMap) {
270 mothurOut("Reading names... "); cout.flush();
272 string name, hold, prevName;
276 openInputFile(blastfile, in);
280 for (int i = 0; i < 11; i++) { in >> hold; }
283 //save name in nameMap
284 nameMap->push_back(prevName);
290 for (int i = 0; i < 11; i++) { in >> hold; }
293 //is this a new name?
294 if (name != prevName) {
296 nameMap->push_back(name);
303 //write out names file
304 //string outNames = getRootName(blastfile) + "names";
306 //openOutputFile(outNames, out);
307 //nameMap->print(out);
310 mothurOut(toString(num) + " names read."); mothurOutEndLine();
313 catch(exception& e) {
314 errorOut(e, "ReadBlast", "readNames");
318 /*********************************************************************************************/