X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=readblast.cpp;fp=readblast.cpp;h=7a092f96cc8b92cd245decd37d388034737a73ed;hp=0000000000000000000000000000000000000000;hb=c82900be3ceed3d9bc491bdc98b1819ef85c1af7;hpb=a5afca18544555fba2d9c3670ad1f8574916b0a0 diff --git a/readblast.cpp b/readblast.cpp new file mode 100644 index 0000000..7a092f9 --- /dev/null +++ b/readblast.cpp @@ -0,0 +1,321 @@ +/* + * readblast.cpp + * Mothur + * + * Created by westcott on 12/10/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "readblast.h" +#include "progress.hpp" + +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareOverlap(DistNode left, DistNode right){ + return (left.dist < right.dist); +} +/*********************************************************************************************/ +ReadBlast::ReadBlast(string file, float c, float p, int l, bool m, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(m), hclusterWanted(h) { + try { + matrix = NULL; + } + catch(exception& e) { + errorOut(e, "ReadBlast", "ReadBlast"); + exit(1); + } +} +/*********************************************************************************************/ +//assumptions about the blast file: +//1. if duplicate lines occur the first line is always best and is chosen +//2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score... +void ReadBlast::read(NameAssignment* nameMap) { + try { + + //if the user has not given a names file read names from blastfile + if (nameMap->size() == 0) { readNames(nameMap); } + int nseqs = nameMap->size(); + + ifstream fileHandle; + openInputFile(blastfile, fileHandle); + + string firstName, secondName, eScore, currentRow; + string repeatName = ""; + int count = 1; + float distance, thisoverlap, refScore; + float percentId; + float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq; + + ofstream outDist; + ofstream outOverlap; + + //create objects needed for read + if (!hclusterWanted) { + matrix = new SparseMatrix(); + }else{ + overlapFile = getRootName(blastfile) + "overlap.dist"; + distFile = getRootName(blastfile) + "hclusterDists.dist"; + + openOutputFile(overlapFile, outOverlap); + openOutputFile(distFile, outDist); + } + + Progress* reading = new Progress("Reading blast: ", nseqs * nseqs); + + //this is used to quickly find if we already have a distance for this combo + vector< map > dists; dists.resize(nseqs); //dists[0][1] = distance from seq0 to seq1 + map thisRowsBlastScores; + + if (!fileHandle.eof()) { + //read in line from file + fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; + gobble(fileHandle); + + currentRow = firstName; + lengthThisSeq = numBases; + repeatName = firstName + secondName; + + if (firstName == secondName) { refScore = score; } + else{ + //convert name to number + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + thisRowsBlastScores[itB->second] = score; + + //calc overlap score + thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); + + //if there is a valid overlap, add it + if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { + if (!hclusterWanted) { + DistNode overlapValue(itA->second, itB->second, thisoverlap); + overlap.push_back(overlapValue); + }else { + outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; + } + } + } + }else { mothurOut("Error in your blast file, cannot read."); mothurOutEndLine(); exit(1); } + + + //read file + while(!fileHandle.eof()){ + + //read in line from file + fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; + //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl; + gobble(fileHandle); + + string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file + + //if this is a new pairing + if (temp != repeatName) { + repeatName = temp; + + if (currentRow == firstName) { + //cout << "first = " << firstName << " second = " << secondName << endl; + if (firstName == secondName) { + refScore = score; + reading->update((count + nseqs)); + count++; + }else{ + //convert name to number + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + //save score + thisRowsBlastScores[itB->second] = score; + + //calc overlap score + thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); + + //if there is a valid overlap, add it + if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { + if (!hclusterWanted) { + DistNode overlapValue(itA->second, itB->second, thisoverlap); + //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl; + overlap.push_back(overlapValue); + }else { + outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; + } + } + } //end else + }else { //end row + //convert blast scores to distance and add cell to sparse matrix if we can + map::iterator it; + map::iterator itDist; + for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) { + distance = 1.0 - (it->second / refScore); + + //do we already have the distance calculated for b->a + map::iterator itA = nameMap->find(currentRow); + itDist = dists[it->first].find(itA->second); + + //if we have it then compare + if (itDist != dists[it->first].end()) { + //if you want the minimum blast score ratio, then pick max distance + if(minWanted) { distance = max(itDist->second, distance); } + else{ distance = min(itDist->second, distance); } + + //is this distance below cutoff + if (distance < cutoff) { + if (!hclusterWanted) { + PCell value(itA->second, it->first, distance); + matrix->addCell(value); + }else{ + outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl; + } + } + //not going to need this again + dists[it->first].erase(itDist); + }else { //save this value until we get the other ratio + dists[itA->second][it->first] = distance; + } + } + //clear out last rows info + thisRowsBlastScores.clear(); + + currentRow = firstName; + lengthThisSeq = numBases; + + //add this row to thisRowsBlastScores + if (firstName == secondName) { refScore = score; } + else{ //add this row to thisRowsBlastScores + + //convert name to number + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + thisRowsBlastScores[itB->second] = score; + + //calc overlap score + thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); + + //if there is a valid overlap, add it + if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { + if (!hclusterWanted) { + DistNode overlapValue(itA->second, itB->second, thisoverlap); + overlap.push_back(overlapValue); + }else { + outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; + } + } + } + }//end if current row + }//end if repeat + }//end while + + //get last rows info stored + //convert blast scores to distance and add cell to sparse matrix if we can + map::iterator it; + map::iterator itDist; + for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) { + distance = 1.0 - (it->second / refScore); + + //do we already have the distance calculated for b->a + map::iterator itA = nameMap->find(currentRow); + itDist = dists[it->first].find(itA->second); + + //if we have it then compare + if (itDist != dists[it->first].end()) { + //if you want the minimum blast score ratio, then pick max distance + if(minWanted) { distance = max(itDist->second, distance); } + else{ distance = min(itDist->second, distance); } + + //is this distance below cutoff + if (distance < cutoff) { + if (!hclusterWanted) { + PCell value(itA->second, it->first, distance); + matrix->addCell(value); + }else{ + outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl; + } + } + //not going to need this again + dists[it->first].erase(itDist); + }else { //save this value until we get the other ratio + dists[itA->second][it->first] = distance; + } + } + //clear out info + thisRowsBlastScores.clear(); + dists.clear(); + + if (!hclusterWanted) { + sort(overlap.begin(), overlap.end(), compareOverlap); + }else { + outDist.close(); + outOverlap.close(); + } + + reading->finish(); + delete reading; + fileHandle.close(); + } + catch(exception& e) { + errorOut(e, "ReadBlast", "read"); + exit(1); + } +} +/*********************************************************************************************/ +void ReadBlast::readNames(NameAssignment* nameMap) { + try { + mothurOut("Reading names... "); cout.flush(); + + string name, hold, prevName; + int num = 1; + + ifstream in; + openInputFile(blastfile, in); + + //read first line + in >> prevName; + for (int i = 0; i < 11; i++) { in >> hold; } + gobble(in); + + //save name in nameMap + nameMap->push_back(prevName); + + while (!in.eof()) { + + //read line + in >> name; + for (int i = 0; i < 11; i++) { in >> hold; } + gobble(in); + + //is this a new name? + if (name != prevName) { + prevName = name; + nameMap->push_back(name); + num++; + } + } + + in.close(); + + //write out names file + //string outNames = getRootName(blastfile) + "names"; + //ofstream out; + //openOutputFile(outNames, out); + //nameMap->print(out); + //out.close(); + + mothurOut(toString(num) + " names read."); mothurOutEndLine(); + + } + catch(exception& e) { + errorOut(e, "ReadBlast", "readNames"); + exit(1); + } +} +/*********************************************************************************************/ + + +