5 * Created by westcott on 12/10/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "readblast.h"
11 #include "progress.hpp"
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareOverlap(seqDist left, seqDist right){
16 return (left.dist < right.dist);
18 /*********************************************************************************************/
19 ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
21 m = MothurOut::getInstance();
25 m->errorOut(e, "ReadBlast", "ReadBlast");
29 /*********************************************************************************************/
30 //assumptions about the blast file:
31 //1. if duplicate lines occur the first line is always best and is chosen
32 //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
33 void ReadBlast::read(NameAssignment* nameMap) {
36 //if the user has not given a names file read names from blastfile
37 if (nameMap->size() == 0) { readNames(nameMap); }
38 int nseqs = nameMap->size();
41 openInputFile(blastfile, fileHandle);
43 string firstName, secondName, eScore, currentRow;
44 string repeatName = "";
46 float distance, thisoverlap, refScore;
48 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
53 //create objects needed for read
54 if (!hclusterWanted) {
55 matrix = new SparseMatrix();
57 overlapFile = getRootName(blastfile) + "overlap.dist";
58 distFile = getRootName(blastfile) + "hclusterDists.dist";
60 openOutputFile(overlapFile, outOverlap);
61 openOutputFile(distFile, outDist);
64 Progress* reading = new Progress("Reading blast: ", nseqs * nseqs);
66 //this is used to quickly find if we already have a distance for this combo
67 vector< map<int,float> > dists; dists.resize(nseqs); //dists[0][1] = distance from seq0 to seq1
68 map<int, float> thisRowsBlastScores;
70 if (!fileHandle.eof()) {
71 //read in line from file
72 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
75 currentRow = firstName;
76 lengthThisSeq = numBases;
77 repeatName = firstName + secondName;
79 if (firstName == secondName) { refScore = score; }
81 //convert name to number
82 map<string,int>::iterator itA = nameMap->find(firstName);
83 map<string,int>::iterator itB = nameMap->find(secondName);
84 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
85 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
87 thisRowsBlastScores[itB->second] = score;
90 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
92 //if there is a valid overlap, add it
93 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
94 if (!hclusterWanted) {
95 seqDist overlapValue(itA->second, itB->second, thisoverlap);
96 overlap.push_back(overlapValue);
98 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
102 }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
106 while(!fileHandle.eof()){
108 //read in line from file
109 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
110 //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
113 string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
115 //if this is a new pairing
116 if (temp != repeatName) {
119 if (currentRow == firstName) {
120 //cout << "first = " << firstName << " second = " << secondName << endl;
121 if (firstName == secondName) {
123 reading->update((count + nseqs));
126 //convert name to number
127 map<string,int>::iterator itA = nameMap->find(firstName);
128 map<string,int>::iterator itB = nameMap->find(secondName);
129 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
130 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
133 thisRowsBlastScores[itB->second] = score;
136 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
138 //if there is a valid overlap, add it
139 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
140 if (!hclusterWanted) {
141 seqDist overlapValue(itA->second, itB->second, thisoverlap);
142 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
143 overlap.push_back(overlapValue);
145 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
150 //convert blast scores to distance and add cell to sparse matrix if we can
151 map<int, float>::iterator it;
152 map<int, float>::iterator itDist;
153 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
154 distance = 1.0 - (it->second / refScore);
156 //do we already have the distance calculated for b->a
157 map<string,int>::iterator itA = nameMap->find(currentRow);
158 itDist = dists[it->first].find(itA->second);
160 //if we have it then compare
161 if (itDist != dists[it->first].end()) {
162 //if you want the minimum blast score ratio, then pick max distance
163 if(minWanted) { distance = max(itDist->second, distance); }
164 else{ distance = min(itDist->second, distance); }
166 //is this distance below cutoff
167 if (distance < cutoff) {
168 if (!hclusterWanted) {
169 PCell value(itA->second, it->first, distance);
170 matrix->addCell(value);
172 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
175 //not going to need this again
176 dists[it->first].erase(itDist);
177 }else { //save this value until we get the other ratio
178 dists[itA->second][it->first] = distance;
181 //clear out last rows info
182 thisRowsBlastScores.clear();
184 currentRow = firstName;
185 lengthThisSeq = numBases;
187 //add this row to thisRowsBlastScores
188 if (firstName == secondName) { refScore = score; }
189 else{ //add this row to thisRowsBlastScores
191 //convert name to number
192 map<string,int>::iterator itA = nameMap->find(firstName);
193 map<string,int>::iterator itB = nameMap->find(secondName);
194 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
195 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
197 thisRowsBlastScores[itB->second] = score;
200 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
202 //if there is a valid overlap, add it
203 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
204 if (!hclusterWanted) {
205 seqDist overlapValue(itA->second, itB->second, thisoverlap);
206 overlap.push_back(overlapValue);
208 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
212 }//end if current row
216 //get last rows info stored
217 //convert blast scores to distance and add cell to sparse matrix if we can
218 map<int, float>::iterator it;
219 map<int, float>::iterator itDist;
220 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
221 distance = 1.0 - (it->second / refScore);
223 //do we already have the distance calculated for b->a
224 map<string,int>::iterator itA = nameMap->find(currentRow);
225 itDist = dists[it->first].find(itA->second);
227 //if we have it then compare
228 if (itDist != dists[it->first].end()) {
229 //if you want the minimum blast score ratio, then pick max distance
230 if(minWanted) { distance = max(itDist->second, distance); }
231 else{ distance = min(itDist->second, distance); }
233 //is this distance below cutoff
234 if (distance < cutoff) {
235 if (!hclusterWanted) {
236 PCell value(itA->second, it->first, distance);
237 matrix->addCell(value);
239 outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
242 //not going to need this again
243 dists[it->first].erase(itDist);
244 }else { //save this value until we get the other ratio
245 dists[itA->second][it->first] = distance;
249 thisRowsBlastScores.clear();
252 if (!hclusterWanted) {
253 sort(overlap.begin(), overlap.end(), compareOverlap);
263 catch(exception& e) {
264 m->errorOut(e, "ReadBlast", "read");
268 /*********************************************************************************************/
269 void ReadBlast::readNames(NameAssignment* nameMap) {
271 m->mothurOut("Reading names... "); cout.flush();
273 string name, hold, prevName;
277 openInputFile(blastfile, in);
281 for (int i = 0; i < 11; i++) { in >> hold; }
284 //save name in nameMap
285 nameMap->push_back(prevName);
291 for (int i = 0; i < 11; i++) { in >> hold; }
294 //is this a new name?
295 if (name != prevName) {
297 nameMap->push_back(name);
304 //write out names file
305 //string outNames = getRootName(blastfile) + "names";
307 //openOutputFile(outNames, out);
308 //nameMap->print(out);
311 m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
314 catch(exception& e) {
315 m->errorOut(e, "ReadBlast", "readNames");
319 /*********************************************************************************************/