]> git.donarmstrong.com Git - mothur.git/blobdiff - readblast.cpp
Revert to previous commit
[mothur.git] / readblast.cpp
diff --git a/readblast.cpp b/readblast.cpp
new file mode 100644 (file)
index 0000000..2d49477
--- /dev/null
@@ -0,0 +1,369 @@
+/*
+ *  readblast.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 12/10/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "readblast.h"
+#include "progress.hpp"
+
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareOverlap(seqDist left, seqDist right){
+       return (left.dist < right.dist);        
+} 
+/*********************************************************************************************/
+ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
+       try {
+               m = MothurOut::getInstance();
+               matrix = NULL;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ReadBlast", "ReadBlast");
+               exit(1);
+       }
+} 
+/*********************************************************************************************/
+//assumptions about the blast file: 
+//1. if duplicate lines occur the first line is always best and is chosen
+//2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
+int ReadBlast::read(NameAssignment* nameMap) {
+       try {
+       
+               //if the user has not given a names file read names from blastfile
+               if (nameMap->size() == 0) { readNames(nameMap);  }
+               int nseqs = nameMap->size();
+               
+               if (m->control_pressed) { return 0; }
+
+               ifstream fileHandle;
+               m->openInputFile(blastfile, fileHandle);
+               
+               string firstName, secondName, eScore, currentRow;
+               string repeatName = "";
+               int count = 1;
+               float distance, thisoverlap, refScore;
+               float percentId; 
+               float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
+               
+               ofstream outDist;
+               ofstream outOverlap;
+               
+               //create objects needed for read
+               if (!hclusterWanted) {
+                       matrix = new SparseMatrix();
+               }else{
+                       overlapFile = m->getRootName(blastfile) + "overlap.dist";
+                       distFile = m->getRootName(blastfile) + "hclusterDists.dist";
+                       
+                       m->openOutputFile(overlapFile, outOverlap);
+                       m->openOutputFile(distFile, outDist);
+               }
+               
+               if (m->control_pressed) { 
+                       fileHandle.close();
+                       if (!hclusterWanted) {  delete matrix; }
+                       else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
+                       return 0;
+               }
+               
+               Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
+               
+               //this is used to quickly find if we already have a distance for this combo
+               vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
+               map<int, float> thisRowsBlastScores;
+               
+               if (!fileHandle.eof()) {
+                       //read in line from file
+                       fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+                       m->gobble(fileHandle);
+                       
+                       currentRow = firstName;
+                       lengthThisSeq = numBases;
+                       repeatName = firstName + secondName;
+                       
+                       if (firstName == secondName) {   refScore = score;  }
+                       else{
+                               //convert name to number
+                               map<string,int>::iterator itA = nameMap->find(firstName);
+                               map<string,int>::iterator itB = nameMap->find(secondName);
+                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
+                               
+                               thisRowsBlastScores[itB->second] = score;
+                               
+                               //calc overlap score
+                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+                               
+                               //if there is a valid overlap, add it
+                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+                                       if (!hclusterWanted) {
+                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
+                                               overlap.push_back(overlapValue);
+                                       }else {
+                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+                                       }
+                               }
+                       }
+               }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
+
+                               
+               //read file
+               while(!fileHandle.eof()){  
+               
+                       if (m->control_pressed) { 
+                               fileHandle.close();
+                               if (!hclusterWanted) {  delete matrix; }
+                               else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
+                               delete reading;
+                               return 0;
+                       }
+                       
+                       //read in line from file
+                       fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+                       //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
+                       m->gobble(fileHandle);
+                       
+                       string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
+                       
+                       //if this is a new pairing
+                       if (temp != repeatName) {
+                               repeatName = temp; 
+                               
+                               if (currentRow == firstName) {
+                                       //cout << "first = " << firstName << " second = " << secondName << endl;        
+                                       if (firstName == secondName) { 
+                                               refScore = score;  
+                                               reading->update((count + nseqs));  
+                                               count++; 
+                                       }else{
+                                               //convert name to number
+                                               map<string,int>::iterator itA = nameMap->find(firstName);
+                                               map<string,int>::iterator itB = nameMap->find(secondName);
+                                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
+                                               
+                                               //save score
+                                               thisRowsBlastScores[itB->second] = score;
+                                                       
+                                               //calc overlap score
+                                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+                                               
+                                               //if there is a valid overlap, add it
+                                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+                                                       if (!hclusterWanted) {
+                                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
+                                                               //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
+                                                               overlap.push_back(overlapValue);
+                                                       }else {
+                                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+                                                       }
+                                               }
+                                       } //end else
+                               }else { //end row
+                                       //convert blast scores to distance and add cell to sparse matrix if we can
+                                       map<int, float>::iterator it;
+                                       map<int, float>::iterator itDist;
+                                       for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
+                                               distance = 1.0 - (it->second / refScore);
+               
+                                               
+                                               //do we already have the distance calculated for b->a
+                                               map<string,int>::iterator itA = nameMap->find(currentRow);
+                                               itDist = dists[it->first].find(itA->second);
+                                               
+                                               //if we have it then compare
+                                               if (itDist != dists[it->first].end()) {
+       
+                                                       //if you want the minimum blast score ratio, then pick max distance
+                                                       if(minWanted) {  distance = max(itDist->second, distance);  }
+                                                       else{   distance = min(itDist->second, distance);  }
+
+                                                       //is this distance below cutoff
+                                                       if (distance < cutoff) {
+                                                               if (!hclusterWanted) {
+                                                                       PCell value(itA->second, it->first, distance);
+                                                                       matrix->addCell(value);
+                                                               }else{
+                                                                       outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
+                                                               }
+                                                       }
+                                                       //not going to need this again
+                                                       dists[it->first].erase(itDist);
+                                               }else { //save this value until we get the other ratio
+                                                       dists[itA->second][it->first] = distance;
+                                               }
+                                       }
+                                       //clear out last rows info
+                                       thisRowsBlastScores.clear();
+                                       
+                                       currentRow = firstName;
+                                       lengthThisSeq = numBases;
+                                       
+                                       //add this row to thisRowsBlastScores
+                                       if (firstName == secondName) {   refScore = score;  }
+                                       else{ //add this row to thisRowsBlastScores
+                                               
+                                               //convert name to number
+                                               map<string,int>::iterator itA = nameMap->find(firstName);
+                                               map<string,int>::iterator itB = nameMap->find(secondName);
+                                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
+                                               
+                                               thisRowsBlastScores[itB->second] = score;
+                                               
+                                               //calc overlap score
+                                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+                                               
+                                               //if there is a valid overlap, add it
+                                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+                                                       if (!hclusterWanted) {
+                                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
+                                                               overlap.push_back(overlapValue);
+                                                       }else {
+                                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+                                                       }
+                                               }
+                                       }
+                               }//end if current row
+                       }//end if repeat
+               }//end while
+               
+               //get last rows info stored
+               //convert blast scores to distance and add cell to sparse matrix if we can
+               map<int, float>::iterator it;
+               map<int, float>::iterator itDist;
+               for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
+                       distance = 1.0 - (it->second / refScore);
+                       
+                       //do we already have the distance calculated for b->a
+                       map<string,int>::iterator itA = nameMap->find(currentRow);
+                       itDist = dists[it->first].find(itA->second);
+                       
+                       //if we have it then compare
+                       if (itDist != dists[it->first].end()) {
+                               //if you want the minimum blast score ratio, then pick max distance
+                               if(minWanted) {  distance = max(itDist->second, distance);  }
+                               else{   distance = min(itDist->second, distance);  }
+                               
+                               //is this distance below cutoff
+                               if (distance < cutoff) {
+                                       if (!hclusterWanted) {
+                                               PCell value(itA->second, it->first, distance);
+                                               matrix->addCell(value);
+                                       }else{
+                                               outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
+                                       }
+                               }
+                               //not going to need this again
+                               dists[it->first].erase(itDist);
+                       }else { //save this value until we get the other ratio
+                               dists[itA->second][it->first] = distance;
+                       }
+               }
+               //clear out info
+               thisRowsBlastScores.clear();
+               dists.clear();
+               
+               if (m->control_pressed) { 
+                               fileHandle.close();
+                               if (!hclusterWanted) {  delete matrix; }
+                               else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
+                               delete reading;
+                               return 0;
+               }
+               
+               if (!hclusterWanted) {
+                       sort(overlap.begin(), overlap.end(), compareOverlap);
+               }else {
+                       outDist.close();
+                       outOverlap.close();
+               }
+               
+               if (m->control_pressed) { 
+                               fileHandle.close();
+                               if (!hclusterWanted) {  delete matrix; }
+                               else {  m->mothurRemove(overlapFile);  m->mothurRemove(distFile);  }
+                               delete reading;
+                               return 0;
+               }
+               
+               reading->finish();
+               delete reading;
+               fileHandle.close();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ReadBlast", "read");
+               exit(1);
+       }
+} 
+/*********************************************************************************************/
+int ReadBlast::readNames(NameAssignment* nameMap) {
+       try {
+               m->mothurOut("Reading names... "); cout.flush();
+               
+               string name, hold, prevName;
+               int num = 1;
+               
+               ifstream in;
+               m->openInputFile(blastfile, in);
+               
+               //ofstream outName;
+               //m->openOutputFile((blastfile + ".tempOutNames"), outName);
+               
+               //read first line
+               in >> prevName;
+       
+               for (int i = 0; i < 11; i++) {  in >> hold;  }
+               m->gobble(in);
+                               
+               //save name in nameMap
+               nameMap->push_back(prevName);
+               
+               while (!in.eof()) {
+                       if (m->control_pressed) { in.close(); return 0; }
+                       
+                       //read line
+                       in >> name;
+       
+                       for (int i = 0; i < 11; i++) {  in >> hold;  }
+                       m->gobble(in);
+                       
+                       //is this a new name?
+                       if (name != prevName) {
+                               prevName = name;
+                               nameMap->push_back(name);
+                               num++;
+                       }
+               }
+       
+               in.close();
+               
+               //write out names file
+               //string outNames = m->getRootName(blastfile) + "names";
+               //ofstream out;
+               //m->openOutputFile(outNames, out);
+               //nameMap->print(out);
+               //out.close();
+               
+               if (m->control_pressed) { return 0; }
+               
+               m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ReadBlast", "readNames");
+               exit(1);
+       }
+} 
+/*********************************************************************************************/
+
+
+