5 * Created by westcott on 12/10/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "readblast.h"
11 #include "progress.hpp"
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(seqDist left, seqDist right){
16 return (left.dist < right.dist);
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
21 m = MothurOut::getInstance();
25 m->errorOut(e, "ReadBlast", "ReadBlast");
29 /*********************************************************************************************/
30 //assumptions about the blast file:
31 //1. if duplicate lines occur the first line is always best and is chosen
32 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
33 int ReadBlast::read(NameAssignment* nameMap) {
36 //if the user has not given a names file read names from blastfile
37 if (nameMap->size() == 0) { readNames(nameMap); }
38 int nseqs = nameMap->size();
40 if (m->control_pressed) { return 0; }
43 m->openInputFile(blastfile, fileHandle);
45 string firstName, secondName, eScore, currentRow;
46 string repeatName = "";
48 float distance, thisoverlap, refScore;
50 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
55 //create objects needed for read
56 if (!hclusterWanted) {
57 matrix = new SparseDistanceMatrix();
58 matrix->resize(nseqs);
60 overlapFile = m->getRootName(blastfile) + "overlap.dist";
61 distFile = m->getRootName(blastfile) + "hclusterDists.dist";
63 m->openOutputFile(overlapFile, outOverlap);
64 m->openOutputFile(distFile, outDist);
67 if (m->control_pressed) {
69 if (!hclusterWanted) { delete matrix; }
70 else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); }
74 Progress* reading = new Progress("Reading blast: ", nseqs * nseqs);
76 //this is used to quickly find if we already have a distance for this combo
77 vector< map<int,float> > dists; dists.resize(nseqs); //dists[0][1] = distance from seq0 to seq1
78 map<int, float> thisRowsBlastScores;
80 if (!fileHandle.eof()) {
81 //read in line from file
82 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
83 m->gobble(fileHandle);
85 currentRow = firstName;
86 lengthThisSeq = numBases;
87 repeatName = firstName + secondName;
89 if (firstName == secondName) { refScore = score; }
91 //convert name to number
92 map<string,int>::iterator itA = nameMap->find(firstName);
93 map<string,int>::iterator itB = nameMap->find(secondName);
94 if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
95 if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
97 thisRowsBlastScores[itB->second] = score;
100 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
102 //if there is a valid overlap, add it
103 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
104 if (!hclusterWanted) {
105 seqDist overlapValue(itA->second, itB->second, thisoverlap);
106 overlap.push_back(overlapValue);
108 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
112 }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
113 string outDistFilem = "../kathryn/blastDist.dist";
115 m->openOutputFile(outDistFilem, outMDist);
117 while(!fileHandle.eof()){
119 if (m->control_pressed) {
121 if (!hclusterWanted) { delete matrix; }
122 else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); }
127 //read in line from file
128 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
129 //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
130 m->gobble(fileHandle);
132 string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
134 //if this is a new pairing
135 if (temp != repeatName) {
138 if (currentRow == firstName) {
139 //cout << "first = " << firstName << " second = " << secondName << endl;
140 if (firstName == secondName) {
142 reading->update((count + nseqs));
145 //convert name to number
146 map<string,int>::iterator itA = nameMap->find(firstName);
147 map<string,int>::iterator itB = nameMap->find(secondName);
148 if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
149 if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
152 thisRowsBlastScores[itB->second] = score;
155 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
157 //if there is a valid overlap, add it
158 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
159 if (!hclusterWanted) {
160 seqDist overlapValue(itA->second, itB->second, thisoverlap);
161 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
162 overlap.push_back(overlapValue);
164 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
169 //convert blast scores to distance and add cell to sparse matrix if we can
170 map<int, float>::iterator it;
171 map<int, float>::iterator itDist;
172 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
173 distance = 1.0 - (it->second / refScore);
176 //do we already have the distance calculated for b->a
177 map<string,int>::iterator itA = nameMap->find(currentRow);
178 itDist = dists[it->first].find(itA->second);
180 //if we have it then compare
181 if (itDist != dists[it->first].end()) {
183 //if you want the minimum blast score ratio, then pick max distance
184 if(minWanted) { distance = max(itDist->second, distance); }
185 else{ distance = min(itDist->second, distance); }
187 //is this distance below cutoff
188 if (distance < cutoff) {
189 if (!hclusterWanted) {
190 if (itA->second < it->first) {
191 PDistCell value(it->first, distance);
192 matrix->addCell(itA->second, value);
194 PDistCell value(itA->second, distance);
195 matrix->addCell(it->first, value);
197 outMDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
199 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
202 //not going to need this again
203 dists[it->first].erase(itDist);
204 }else { //save this value until we get the other ratio
205 dists[itA->second][it->first] = distance;
208 //clear out last rows info
209 thisRowsBlastScores.clear();
211 currentRow = firstName;
212 lengthThisSeq = numBases;
214 //add this row to thisRowsBlastScores
215 if (firstName == secondName) { refScore = score; }
216 else{ //add this row to thisRowsBlastScores
218 //convert name to number
219 map<string,int>::iterator itA = nameMap->find(firstName);
220 map<string,int>::iterator itB = nameMap->find(secondName);
221 if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
222 if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
224 thisRowsBlastScores[itB->second] = score;
227 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
229 //if there is a valid overlap, add it
230 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
231 if (!hclusterWanted) {
232 seqDist overlapValue(itA->second, itB->second, thisoverlap);
233 overlap.push_back(overlapValue);
235 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
239 }//end if current row
243 //get last rows info stored
244 //convert blast scores to distance and add cell to sparse matrix if we can
245 map<int, float>::iterator it;
246 map<int, float>::iterator itDist;
247 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
248 distance = 1.0 - (it->second / refScore);
250 //do we already have the distance calculated for b->a
251 map<string,int>::iterator itA = nameMap->find(currentRow);
252 itDist = dists[it->first].find(itA->second);
254 //if we have it then compare
255 if (itDist != dists[it->first].end()) {
256 //if you want the minimum blast score ratio, then pick max distance
257 if(minWanted) { distance = max(itDist->second, distance); }
258 else{ distance = min(itDist->second, distance); }
260 //is this distance below cutoff
261 if (distance < cutoff) {
262 if (!hclusterWanted) {
263 if (itA->second < it->first) {
264 PDistCell value(it->first, distance);
265 matrix->addCell(itA->second, value);
267 PDistCell value(itA->second, distance);
268 matrix->addCell(it->first, value);
271 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
274 //not going to need this again
275 dists[it->first].erase(itDist);
276 }else { //save this value until we get the other ratio
277 dists[itA->second][it->first] = distance;
281 thisRowsBlastScores.clear();
284 if (m->control_pressed) {
286 if (!hclusterWanted) { delete matrix; }
287 else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); }
292 if (!hclusterWanted) {
293 sort(overlap.begin(), overlap.end(), compareOverlap);
299 if (m->control_pressed) {
301 if (!hclusterWanted) { delete matrix; }
302 else { m->mothurRemove(overlapFile); m->mothurRemove(distFile); }
313 catch(exception& e) {
314 m->errorOut(e, "ReadBlast", "read");
318 /*********************************************************************************************/
319 int ReadBlast::readNames(NameAssignment* nameMap) {
321 m->mothurOut("Reading names... "); cout.flush();
323 string name, hold, prevName;
327 m->openInputFile(blastfile, in);
330 m->openOutputFile((blastfile + ".tempOutNames"), outName);
335 for (int i = 0; i < 11; i++) { in >> hold; }
338 //save name in nameMap
339 nameMap->push_back(prevName);
342 if (m->control_pressed) { in.close(); return 0; }
347 for (int i = 0; i < 11; i++) { in >> hold; }
350 //is this a new name?
351 if (name != prevName) {
353 nameMap->push_back(name);
354 outName << name << '\t' << name << endl;
361 //write out names file
362 //string outNames = m->getRootName(blastfile) + "names";
364 //m->openOutputFile(outNames, out);
365 //nameMap->print(out);
368 if (m->control_pressed) { return 0; }
370 m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
375 catch(exception& e) {
376 m->errorOut(e, "ReadBlast", "readNames");
380 /*********************************************************************************************/