]> git.donarmstrong.com Git - mothur.git/blobdiff - readblast.cpp
moved mothur's source into a folder to make grabbing just the source easier on github
[mothur.git] / readblast.cpp
diff --git a/readblast.cpp b/readblast.cpp
deleted file mode 100644 (file)
index 2d49477..0000000
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- *  readblast.cpp
- *  Mothur
- *
- *  Created by westcott on 12/10/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "readblast.h"
-#include "progress.hpp"
-
-//********************************************************************************************************************
-//sorts lowest to highest
-inline bool compareOverlap(seqDist left, seqDist right){
-       return (left.dist < right.dist);        
-} 
-/*********************************************************************************************/
-ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) {
-       try {
-               m = MothurOut::getInstance();
-               matrix = NULL;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ReadBlast", "ReadBlast");
-               exit(1);
-       }
-} 
-/*********************************************************************************************/
-//assumptions about the blast file: 
-//1. if duplicate lines occur the first line is always best and is chosen
-//2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
-int ReadBlast::read(NameAssignment* nameMap) {
-       try {
-       
-               //if the user has not given a names file read names from blastfile
-               if (nameMap->size() == 0) { readNames(nameMap);  }
-               int nseqs = nameMap->size();
-               
-               if (m->control_pressed) { return 0; }
-
-               ifstream fileHandle;
-               m->openInputFile(blastfile, fileHandle);
-               
-               string firstName, secondName, eScore, currentRow;
-               string repeatName = "";
-               int count = 1;
-               float distance, thisoverlap, refScore;
-               float percentId; 
-               float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
-               
-               ofstream outDist;
-               ofstream outOverlap;
-               
-               //create objects needed for read
-               if (!hclusterWanted) {
-                       matrix = new SparseMatrix();
-               }else{
-                       overlapFile = m->getRootName(blastfile) + "overlap.dist";
-                       distFile = m->getRootName(blastfile) + "hclusterDists.dist";
-                       
-                       m->openOutputFile(overlapFile, outOverlap);
-                       m->openOutputFile(distFile, outDist);
-               }
-               
-               if (m->control_pressed) { 
-                       fileHandle.close();
-                       if (!hclusterWanted) {  delete matrix; }
-                       else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
-                       return 0;
-               }
-               
-               Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
-               
-               //this is used to quickly find if we already have a distance for this combo
-               vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
-               map<int, float> thisRowsBlastScores;
-               
-               if (!fileHandle.eof()) {
-                       //read in line from file
-                       fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
-                       m->gobble(fileHandle);
-                       
-                       currentRow = firstName;
-                       lengthThisSeq = numBases;
-                       repeatName = firstName + secondName;
-                       
-                       if (firstName == secondName) {   refScore = score;  }
-                       else{
-                               //convert name to number
-                               map<string,int>::iterator itA = nameMap->find(firstName);
-                               map<string,int>::iterator itB = nameMap->find(secondName);
-                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
-                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
-                               
-                               thisRowsBlastScores[itB->second] = score;
-                               
-                               //calc overlap score
-                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
-                               
-                               //if there is a valid overlap, add it
-                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
-                                       if (!hclusterWanted) {
-                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
-                                               overlap.push_back(overlapValue);
-                                       }else {
-                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
-                                       }
-                               }
-                       }
-               }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
-
-                               
-               //read file
-               while(!fileHandle.eof()){  
-               
-                       if (m->control_pressed) { 
-                               fileHandle.close();
-                               if (!hclusterWanted) {  delete matrix; }
-                               else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
-                               delete reading;
-                               return 0;
-                       }
-                       
-                       //read in line from file
-                       fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
-                       //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
-                       m->gobble(fileHandle);
-                       
-                       string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
-                       
-                       //if this is a new pairing
-                       if (temp != repeatName) {
-                               repeatName = temp; 
-                               
-                               if (currentRow == firstName) {
-                                       //cout << "first = " << firstName << " second = " << secondName << endl;        
-                                       if (firstName == secondName) { 
-                                               refScore = score;  
-                                               reading->update((count + nseqs));  
-                                               count++; 
-                                       }else{
-                                               //convert name to number
-                                               map<string,int>::iterator itA = nameMap->find(firstName);
-                                               map<string,int>::iterator itB = nameMap->find(secondName);
-                                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
-                                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
-                                               
-                                               //save score
-                                               thisRowsBlastScores[itB->second] = score;
-                                                       
-                                               //calc overlap score
-                                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
-                                               
-                                               //if there is a valid overlap, add it
-                                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
-                                                       if (!hclusterWanted) {
-                                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
-                                                               //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
-                                                               overlap.push_back(overlapValue);
-                                                       }else {
-                                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
-                                                       }
-                                               }
-                                       } //end else
-                               }else { //end row
-                                       //convert blast scores to distance and add cell to sparse matrix if we can
-                                       map<int, float>::iterator it;
-                                       map<int, float>::iterator itDist;
-                                       for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
-                                               distance = 1.0 - (it->second / refScore);
-               
-                                               
-                                               //do we already have the distance calculated for b->a
-                                               map<string,int>::iterator itA = nameMap->find(currentRow);
-                                               itDist = dists[it->first].find(itA->second);
-                                               
-                                               //if we have it then compare
-                                               if (itDist != dists[it->first].end()) {
-       
-                                                       //if you want the minimum blast score ratio, then pick max distance
-                                                       if(minWanted) {  distance = max(itDist->second, distance);  }
-                                                       else{   distance = min(itDist->second, distance);  }
-
-                                                       //is this distance below cutoff
-                                                       if (distance < cutoff) {
-                                                               if (!hclusterWanted) {
-                                                                       PCell value(itA->second, it->first, distance);
-                                                                       matrix->addCell(value);
-                                                               }else{
-                                                                       outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
-                                                               }
-                                                       }
-                                                       //not going to need this again
-                                                       dists[it->first].erase(itDist);
-                                               }else { //save this value until we get the other ratio
-                                                       dists[itA->second][it->first] = distance;
-                                               }
-                                       }
-                                       //clear out last rows info
-                                       thisRowsBlastScores.clear();
-                                       
-                                       currentRow = firstName;
-                                       lengthThisSeq = numBases;
-                                       
-                                       //add this row to thisRowsBlastScores
-                                       if (firstName == secondName) {   refScore = score;  }
-                                       else{ //add this row to thisRowsBlastScores
-                                               
-                                               //convert name to number
-                                               map<string,int>::iterator itA = nameMap->find(firstName);
-                                               map<string,int>::iterator itB = nameMap->find(secondName);
-                                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
-                                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
-                                               
-                                               thisRowsBlastScores[itB->second] = score;
-                                               
-                                               //calc overlap score
-                                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
-                                               
-                                               //if there is a valid overlap, add it
-                                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
-                                                       if (!hclusterWanted) {
-                                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
-                                                               overlap.push_back(overlapValue);
-                                                       }else {
-                                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
-                                                       }
-                                               }
-                                       }
-                               }//end if current row
-                       }//end if repeat
-               }//end while
-               
-               //get last rows info stored
-               //convert blast scores to distance and add cell to sparse matrix if we can
-               map<int, float>::iterator it;
-               map<int, float>::iterator itDist;
-               for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
-                       distance = 1.0 - (it->second / refScore);
-                       
-                       //do we already have the distance calculated for b->a
-                       map<string,int>::iterator itA = nameMap->find(currentRow);
-                       itDist = dists[it->first].find(itA->second);
-                       
-                       //if we have it then compare
-                       if (itDist != dists[it->first].end()) {
-                               //if you want the minimum blast score ratio, then pick max distance
-                               if(minWanted) {  distance = max(itDist->second, distance);  }
-                               else{   distance = min(itDist->second, distance);  }
-                               
-                               //is this distance below cutoff
-                               if (distance < cutoff) {
-                                       if (!hclusterWanted) {
-                                               PCell value(itA->second, it->first, distance);
-                                               matrix->addCell(value);
-                                       }else{
-                                               outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
-                                       }
-                               }
-                               //not going to need this again
-                               dists[it->first].erase(itDist);
-                       }else { //save this value until we get the other ratio
-                               dists[itA->second][it->first] = distance;
-                       }
-               }
-               //clear out info
-               thisRowsBlastScores.clear();
-               dists.clear();
-               
-               if (m->control_pressed) { 
-                               fileHandle.close();
-                               if (!hclusterWanted) {  delete matrix; }
-                               else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile);  }
-                               delete reading;
-                               return 0;
-               }
-               
-               if (!hclusterWanted) {
-                       sort(overlap.begin(), overlap.end(), compareOverlap);
-               }else {
-                       outDist.close();
-                       outOverlap.close();
-               }
-               
-               if (m->control_pressed) { 
-                               fileHandle.close();
-                               if (!hclusterWanted) {  delete matrix; }
-                               else {  m->mothurRemove(overlapFile);  m->mothurRemove(distFile);  }
-                               delete reading;
-                               return 0;
-               }
-               
-               reading->finish();
-               delete reading;
-               fileHandle.close();
-               
-               return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ReadBlast", "read");
-               exit(1);
-       }
-} 
-/*********************************************************************************************/
-int ReadBlast::readNames(NameAssignment* nameMap) {
-       try {
-               m->mothurOut("Reading names... "); cout.flush();
-               
-               string name, hold, prevName;
-               int num = 1;
-               
-               ifstream in;
-               m->openInputFile(blastfile, in);
-               
-               //ofstream outName;
-               //m->openOutputFile((blastfile + ".tempOutNames"), outName);
-               
-               //read first line
-               in >> prevName;
-       
-               for (int i = 0; i < 11; i++) {  in >> hold;  }
-               m->gobble(in);
-                               
-               //save name in nameMap
-               nameMap->push_back(prevName);
-               
-               while (!in.eof()) {
-                       if (m->control_pressed) { in.close(); return 0; }
-                       
-                       //read line
-                       in >> name;
-       
-                       for (int i = 0; i < 11; i++) {  in >> hold;  }
-                       m->gobble(in);
-                       
-                       //is this a new name?
-                       if (name != prevName) {
-                               prevName = name;
-                               nameMap->push_back(name);
-                               num++;
-                       }
-               }
-       
-               in.close();
-               
-               //write out names file
-               //string outNames = m->getRootName(blastfile) + "names";
-               //ofstream out;
-               //m->openOutputFile(outNames, out);
-               //nameMap->print(out);
-               //out.close();
-               
-               if (m->control_pressed) { return 0; }
-               
-               m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine();
-               
-               return 0;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ReadBlast", "readNames");
-               exit(1);
-       }
-} 
-/*********************************************************************************************/
-
-
-