5 * Created by westcott on 12/10/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "readblast.h"
11 #include "progress.hpp"
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(seqDist left, seqDist right){
16 return (left.dist < right.dist);
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
21 m = MothurOut::getInstance();
25 m->errorOut(e, "ReadBlast", "ReadBlast");
29 /*********************************************************************************************/
30 //assumptions about the blast file:
31 //1. if duplicate lines occur the first line is always best and is chosen
32 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
33 int ReadBlast::read(NameAssignment* nameMap) {
36 //if the user has not given a names file read names from blastfile
37 if (nameMap->size() == 0) { readNames(nameMap); }
38 int nseqs = nameMap->size();
40 if (m->control_pressed) { return 0; }
43 openInputFile(blastfile, fileHandle);
45 string firstName, secondName, eScore, currentRow;
46 string repeatName = "";
48 float distance, thisoverlap, refScore;
50 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
55 //create objects needed for read
56 if (!hclusterWanted) {
57 matrix = new SparseMatrix();
59 overlapFile = getRootName(blastfile) + "overlap.dist";
60 distFile = getRootName(blastfile) + "hclusterDists.dist";
62 openOutputFile(overlapFile, outOverlap);
63 openOutputFile(distFile, outDist);
66 if (m->control_pressed) {
68 if (!hclusterWanted) { delete matrix; }
69 else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str()); }
73 Progress* reading = new Progress("Reading blast: ", nseqs * nseqs);
75 //this is used to quickly find if we already have a distance for this combo
76 vector< map<int,float> > dists; dists.resize(nseqs); //dists[0][1] = distance from seq0 to seq1
77 map<int, float> thisRowsBlastScores;
79 if (!fileHandle.eof()) {
80 //read in line from file
81 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
84 currentRow = firstName;
85 lengthThisSeq = numBases;
86 repeatName = firstName + secondName;
88 if (firstName == secondName) { refScore = score; }
90 //convert name to number
91 map<string,int>::iterator itA = nameMap->find(firstName);
92 map<string,int>::iterator itB = nameMap->find(secondName);
93 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
94 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
96 thisRowsBlastScores[itB->second] = score;
99 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
101 //if there is a valid overlap, add it
102 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
103 if (!hclusterWanted) {
104 seqDist overlapValue(itA->second, itB->second, thisoverlap);
105 overlap.push_back(overlapValue);
107 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
111 }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
115 while(!fileHandle.eof()){
117 if (m->control_pressed) {
119 if (!hclusterWanted) { delete matrix; }
120 else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str()); }
125 //read in line from file
126 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
127 //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
130 string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
132 //if this is a new pairing
133 if (temp != repeatName) {
136 if (currentRow == firstName) {
137 //cout << "first = " << firstName << " second = " << secondName << endl;
138 if (firstName == secondName) {
140 reading->update((count + nseqs));
143 //convert name to number
144 map<string,int>::iterator itA = nameMap->find(firstName);
145 map<string,int>::iterator itB = nameMap->find(secondName);
146 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
147 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
150 thisRowsBlastScores[itB->second] = score;
153 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
155 //if there is a valid overlap, add it
156 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
157 if (!hclusterWanted) {
158 seqDist overlapValue(itA->second, itB->second, thisoverlap);
159 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
160 overlap.push_back(overlapValue);
162 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
167 //convert blast scores to distance and add cell to sparse matrix if we can
168 map<int, float>::iterator it;
169 map<int, float>::iterator itDist;
170 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
171 distance = 1.0 - (it->second / refScore);
173 //do we already have the distance calculated for b->a
174 map<string,int>::iterator itA = nameMap->find(currentRow);
175 itDist = dists[it->first].find(itA->second);
177 //if we have it then compare
178 if (itDist != dists[it->first].end()) {
179 //if you want the minimum blast score ratio, then pick max distance
180 if(minWanted) { distance = max(itDist->second, distance); }
181 else{ distance = min(itDist->second, distance); }
183 //is this distance below cutoff
184 if (distance < cutoff) {
185 if (!hclusterWanted) {
186 PCell value(itA->second, it->first, distance);
187 matrix->addCell(value);
189 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
192 //not going to need this again
193 dists[it->first].erase(itDist);
194 }else { //save this value until we get the other ratio
195 dists[itA->second][it->first] = distance;
198 //clear out last rows info
199 thisRowsBlastScores.clear();
201 currentRow = firstName;
202 lengthThisSeq = numBases;
204 //add this row to thisRowsBlastScores
205 if (firstName == secondName) { refScore = score; }
206 else{ //add this row to thisRowsBlastScores
208 //convert name to number
209 map<string,int>::iterator itA = nameMap->find(firstName);
210 map<string,int>::iterator itB = nameMap->find(secondName);
211 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
212 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
214 thisRowsBlastScores[itB->second] = score;
217 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
219 //if there is a valid overlap, add it
220 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
221 if (!hclusterWanted) {
222 seqDist overlapValue(itA->second, itB->second, thisoverlap);
223 overlap.push_back(overlapValue);
225 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
229 }//end if current row
233 //get last rows info stored
234 //convert blast scores to distance and add cell to sparse matrix if we can
235 map<int, float>::iterator it;
236 map<int, float>::iterator itDist;
237 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
238 distance = 1.0 - (it->second / refScore);
240 //do we already have the distance calculated for b->a
241 map<string,int>::iterator itA = nameMap->find(currentRow);
242 itDist = dists[it->first].find(itA->second);
244 //if we have it then compare
245 if (itDist != dists[it->first].end()) {
246 //if you want the minimum blast score ratio, then pick max distance
247 if(minWanted) { distance = max(itDist->second, distance); }
248 else{ distance = min(itDist->second, distance); }
250 //is this distance below cutoff
251 if (distance < cutoff) {
252 if (!hclusterWanted) {
253 PCell value(itA->second, it->first, distance);
254 matrix->addCell(value);
256 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
259 //not going to need this again
260 dists[it->first].erase(itDist);
261 }else { //save this value until we get the other ratio
262 dists[itA->second][it->first] = distance;
266 thisRowsBlastScores.clear();
269 if (m->control_pressed) {
271 if (!hclusterWanted) { delete matrix; }
272 else { outOverlap.close(); remove(overlapFile.c_str()); outDist.close(); remove(distFile.c_str()); }
277 if (!hclusterWanted) {
278 sort(overlap.begin(), overlap.end(), compareOverlap);
284 if (m->control_pressed) {
286 if (!hclusterWanted) { delete matrix; }
287 else { remove(overlapFile.c_str()); remove(distFile.c_str()); }
298 catch(exception& e) {
299 m->errorOut(e, "ReadBlast", "read");
303 /*********************************************************************************************/
304 int ReadBlast::readNames(NameAssignment* nameMap) {
306 m->mothurOut("Reading names... "); cout.flush();
308 string name, hold, prevName;
312 openInputFile(blastfile, in);
316 for (int i = 0; i < 11; i++) { in >> hold; }
319 //save name in nameMap
320 nameMap->push_back(prevName);
323 if (m->control_pressed) { in.close(); return 0; }
327 for (int i = 0; i < 11; i++) { in >> hold; }
330 //is this a new name?
331 if (name != prevName) {
333 nameMap->push_back(name);
340 //write out names file
341 //string outNames = getRootName(blastfile) + "names";
343 //openOutputFile(outNames, out);
344 //nameMap->print(out);
346 if (m->control_pressed) { return 0; }
348 m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
353 catch(exception& e) {
354 m->errorOut(e, "ReadBlast", "readNames");
358 /*********************************************************************************************/