--- /dev/null
+/*
+ * readblast.cpp
+ * Mothur
+ *
+ * Created by westcott on 12/10/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "readblast.h"
+#include "progress.hpp"
+
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareOverlap(seqDist left, seqDist right){
+ return (left.dist < right.dist);
+}
+/*********************************************************************************************/
+ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
+ try {
+ m = MothurOut::getInstance();
+ matrix = NULL;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ReadBlast", "ReadBlast");
+ exit(1);
+ }
+}
+/*********************************************************************************************/
+//assumptions about the blast file:
+//1. if duplicate lines occur the first line is always best and is chosen
+//2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
+int ReadBlast::read(NameAssignment* nameMap) {
+ try {
+
+ //if the user has not given a names file read names from blastfile
+ if (nameMap->size() == 0) { readNames(nameMap); }
+ int nseqs = nameMap->size();
+
+ if (m->control_pressed) { return 0; }
+
+ ifstream fileHandle;
+ m->openInputFile(blastfile, fileHandle);
+
+ string firstName, secondName, eScore, currentRow;
+ string repeatName = "";
+ int count = 1;
+ float distance, thisoverlap, refScore;
+ float percentId;
+ float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
+
+ ofstream outDist;
+ ofstream outOverlap;
+
+ //create objects needed for read
+ if (!hclusterWanted) {
+ matrix = new SparseMatrix();
+ }else{
+ overlapFile = m->getRootName(blastfile) + "overlap.dist";
+ distFile = m->getRootName(blastfile) + "hclusterDists.dist";
+
+ m->openOutputFile(overlapFile, outOverlap);
+ m->openOutputFile(distFile, outDist);
+ }
+
+ if (m->control_pressed) {
+ fileHandle.close();
+ if (!hclusterWanted) { delete matrix; }
+ else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); }
+ return 0;
+ }
+
+ Progress* reading = new Progress("Reading blast: ", nseqs * nseqs);
+
+ //this is used to quickly find if we already have a distance for this combo
+ vector< map<int,float> > dists; dists.resize(nseqs); //dists[0][1] = distance from seq0 to seq1
+ map<int, float> thisRowsBlastScores;
+
+ if (!fileHandle.eof()) {
+ //read in line from file
+ fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+ m->gobble(fileHandle);
+
+ currentRow = firstName;
+ lengthThisSeq = numBases;
+ repeatName = firstName + secondName;
+
+ if (firstName == secondName) { refScore = score; }
+ else{
+ //convert name to number
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
+ if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
+
+ thisRowsBlastScores[itB->second] = score;
+
+ //calc overlap score
+ thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+
+ //if there is a valid overlap, add it
+ if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+ if (!hclusterWanted) {
+ seqDist overlapValue(itA->second, itB->second, thisoverlap);
+ overlap.push_back(overlapValue);
+ }else {
+ outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+ }
+ }
+ }
+ }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
+
+
+ //read file
+ while(!fileHandle.eof()){
+
+ if (m->control_pressed) {
+ fileHandle.close();
+ if (!hclusterWanted) { delete matrix; }
+ else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); }
+ delete reading;
+ return 0;
+ }
+
+ //read in line from file
+ fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+ //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;
+ m->gobble(fileHandle);
+
+ string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
+
+ //if this is a new pairing
+ if (temp != repeatName) {
+ repeatName = temp;
+
+ if (currentRow == firstName) {
+ //cout << "first = " << firstName << " second = " << secondName << endl;
+ if (firstName == secondName) {
+ refScore = score;
+ reading->update((count + nseqs));
+ count++;
+ }else{
+ //convert name to number
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
+ if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
+
+ //save score
+ thisRowsBlastScores[itB->second] = score;
+
+ //calc overlap score
+ thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+
+ //if there is a valid overlap, add it
+ if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+ if (!hclusterWanted) {
+ seqDist overlapValue(itA->second, itB->second, thisoverlap);
+ //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
+ overlap.push_back(overlapValue);
+ }else {
+ outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+ }
+ }
+ } //end else
+ }else { //end row
+ //convert blast scores to distance and add cell to sparse matrix if we can
+ map<int, float>::iterator it;
+ map<int, float>::iterator itDist;
+ for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
+ distance = 1.0 - (it->second / refScore);
+
+
+ //do we already have the distance calculated for b->a
+ map<string,int>::iterator itA = nameMap->find(currentRow);
+ itDist = dists[it->first].find(itA->second);
+
+ //if we have it then compare
+ if (itDist != dists[it->first].end()) {
+
+ //if you want the minimum blast score ratio, then pick max distance
+ if(minWanted) { distance = max(itDist->second, distance); }
+ else{ distance = min(itDist->second, distance); }
+
+ //is this distance below cutoff
+ if (distance < cutoff) {
+ if (!hclusterWanted) {
+ PCell value(itA->second, it->first, distance);
+ matrix->addCell(value);
+ }else{
+ outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
+ }
+ }
+ //not going to need this again
+ dists[it->first].erase(itDist);
+ }else { //save this value until we get the other ratio
+ dists[itA->second][it->first] = distance;
+ }
+ }
+ //clear out last rows info
+ thisRowsBlastScores.clear();
+
+ currentRow = firstName;
+ lengthThisSeq = numBases;
+
+ //add this row to thisRowsBlastScores
+ if (firstName == secondName) { refScore = score; }
+ else{ //add this row to thisRowsBlastScores
+
+ //convert name to number
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
+ if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
+
+ thisRowsBlastScores[itB->second] = score;
+
+ //calc overlap score
+ thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+
+ //if there is a valid overlap, add it
+ if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+ if (!hclusterWanted) {
+ seqDist overlapValue(itA->second, itB->second, thisoverlap);
+ overlap.push_back(overlapValue);
+ }else {
+ outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+ }
+ }
+ }
+ }//end if current row
+ }//end if repeat
+ }//end while
+
+ //get last rows info stored
+ //convert blast scores to distance and add cell to sparse matrix if we can
+ map<int, float>::iterator it;
+ map<int, float>::iterator itDist;
+ for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
+ distance = 1.0 - (it->second / refScore);
+
+ //do we already have the distance calculated for b->a
+ map<string,int>::iterator itA = nameMap->find(currentRow);
+ itDist = dists[it->first].find(itA->second);
+
+ //if we have it then compare
+ if (itDist != dists[it->first].end()) {
+ //if you want the minimum blast score ratio, then pick max distance
+ if(minWanted) { distance = max(itDist->second, distance); }
+ else{ distance = min(itDist->second, distance); }
+
+ //is this distance below cutoff
+ if (distance < cutoff) {
+ if (!hclusterWanted) {
+ PCell value(itA->second, it->first, distance);
+ matrix->addCell(value);
+ }else{
+ outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
+ }
+ }
+ //not going to need this again
+ dists[it->first].erase(itDist);
+ }else { //save this value until we get the other ratio
+ dists[itA->second][it->first] = distance;
+ }
+ }
+ //clear out info
+ thisRowsBlastScores.clear();
+ dists.clear();
+
+ if (m->control_pressed) {
+ fileHandle.close();
+ if (!hclusterWanted) { delete matrix; }
+ else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); }
+ delete reading;
+ return 0;
+ }
+
+ if (!hclusterWanted) {
+ sort(overlap.begin(), overlap.end(), compareOverlap);
+ }else {
+ outDist.close();
+ outOverlap.close();
+ }
+
+ if (m->control_pressed) {
+ fileHandle.close();
+ if (!hclusterWanted) { delete matrix; }
+ else { m->mothurRemove(overlapFile); m->mothurRemove(distFile); }
+ delete reading;
+ return 0;
+ }
+
+ reading->finish();
+ delete reading;
+ fileHandle.close();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ReadBlast", "read");
+ exit(1);
+ }
+}
+/*********************************************************************************************/
+int ReadBlast::readNames(NameAssignment* nameMap) {
+ try {
+ m->mothurOut("Reading names... "); cout.flush();
+
+ string name, hold, prevName;
+ int num = 1;
+
+ ifstream in;
+ m->openInputFile(blastfile, in);
+
+ //ofstream outName;
+ //m->openOutputFile((blastfile + ".tempOutNames"), outName);
+
+ //read first line
+ in >> prevName;
+
+ for (int i = 0; i < 11; i++) { in >> hold; }
+ m->gobble(in);
+
+ //save name in nameMap
+ nameMap->push_back(prevName);
+
+ while (!in.eof()) {
+ if (m->control_pressed) { in.close(); return 0; }
+
+ //read line
+ in >> name;
+
+ for (int i = 0; i < 11; i++) { in >> hold; }
+ m->gobble(in);
+
+ //is this a new name?
+ if (name != prevName) {
+ prevName = name;
+ nameMap->push_back(name);
+ num++;
+ }
+ }
+
+ in.close();
+
+ //write out names file
+ //string outNames = m->getRootName(blastfile) + "names";
+ //ofstream out;
+ //m->openOutputFile(outNames, out);
+ //nameMap->print(out);
+ //out.close();
+
+ if (m->control_pressed) { return 0; }
+
+ m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ReadBlast", "readNames");
+ exit(1);
+ }
+}
+/*********************************************************************************************/
+
+
+