]> git.donarmstrong.com Git - mothur.git/blobdiff - chimera.cpp
working on pam
[mothur.git] / chimera.cpp
index 8aaab4b12a4c5c261f8a7a4494ca7ef4f12ea923..53e77322f52ba40eb478e0b385a84982de752d6b 100644 (file)
@@ -8,13 +8,14 @@
  */
 
 #include "chimera.h"
+#include "referencedb.h"
 
 //***************************************************************************************************************
-//this is a vertical filter
-void Chimera::createFilter(vector<Sequence*> seqs) {
+//this is a vertical soft filter
+string Chimera::createFilter(vector<Sequence*> seqs, float t) {
        try {
-               
-               int threshold = int (0.5 * seqs.size());
+               filterString = "";
+               int threshold = int (t * seqs.size());
 //cout << "threshhold = " << threshold << endl;
                
                vector<int> gaps;       gaps.resize(seqs[0]->getAligned().length(), 0);
@@ -22,14 +23,18 @@ void Chimera::createFilter(vector<Sequence*> seqs) {
                vector<int> t;          t.resize(seqs[0]->getAligned().length(), 0);
                vector<int> g;          g.resize(seqs[0]->getAligned().length(), 0);
                vector<int> c;          c.resize(seqs[0]->getAligned().length(), 0);
-               
+       
                filterString = (string(seqs[0]->getAligned().length(), '1'));
                
                //for each sequence
                for (int i = 0; i < seqs.size(); i++) {
                
+                       if (m->control_pressed) { return filterString; }
+               
                        string seqAligned = seqs[i]->getAligned();
                        
+                       if (seqAligned.length() != filterString.length()) {  m->mothurOut(seqs[i]->getName() + " is not the same length as the template sequences. Aborting!\n");  exit(1); }
+               
                        for (int j = 0; j < seqAligned.length(); j++) {
                                //if this spot is a gap
                                if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
@@ -41,64 +46,217 @@ void Chimera::createFilter(vector<Sequence*> seqs) {
                }
                
                //zero out spot where all sequences have blanks
+               int numColRemoved = 0;
                for(int i = 0;i < seqs[0]->getAligned().length(); i++){
-                       if(gaps[i] == seqs.size())      {       filterString[i] = '0';  }
+               
+                       if (m->control_pressed) { return filterString; }
+                       
+                       if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
                        
-                       else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {      filterString[i] = '0';  }
+                       else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {      filterString[i] = '0';  numColRemoved++;  }
                        //cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
                }
-//cout << "filter = " << filterString << endl; 
+
+               if (threshold != 0.0) {  m->mothurOut("Filter removed " + toString(numColRemoved) + " columns.");  m->mothurOutEndLine();  }
+               
+               return filterString;
        }
        catch(exception& e) {
-               errorOut(e, "Chimera", "createFilter");
+               m->errorOut(e, "Chimera", "createFilter");
                exit(1);
        }
 }
 //***************************************************************************************************************
-void Chimera::runFilter(vector<Sequence*> seqs) {
+map<int, int> Chimera::runFilter(Sequence* seq) {
        try {
-               
-               //for each sequence
-               for (int i = 0; i < seqs.size(); i++) {
-               
-                       string seqAligned = seqs[i]->getAligned();
-                       string newAligned = "";
+               map<int, int> maskMap;
+               string seqAligned = seq->getAligned();
+               string newAligned = "";
+               int count = 0;
                        
-                       for (int j = 0; j < seqAligned.length(); j++) {
-                               //if this spot is a gap
-                               if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+               for (int j = 0; j < seqAligned.length(); j++) {
+                       //if this spot is a gap
+                       if (filterString[j] == '1') { 
+                               newAligned += seqAligned[j]; 
+                               maskMap[count] = j;
+                               count++;
                        }
-                       
-                       seqs[i]->setAligned(newAligned);
                }
+                       
+               seq->setAligned(newAligned);
                
+               return maskMap;
        }
        catch(exception& e) {
-               errorOut(e, "Chimera", "runFilter");
+               m->errorOut(e, "Chimera", "runFilter");
                exit(1);
        }
 }
-
 //***************************************************************************************************************
 vector<Sequence*> Chimera::readSeqs(string file) {
        try {
-               ifstream in;
-               openInputFile(file, in);
+               
                vector<Sequence*> container;
+               int count = 0;
+               length = 0;
+               unaligned = false;
+               ReferenceDB* rdb = ReferenceDB::getInstance();
+               
+               if (file == "saved") {
+                       
+                       
+                       m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");        m->mothurOutEndLine();
+                       
+                       for (int i = 0; i < rdb->referenceSeqs.size(); i++) {
+                               Sequence* temp = new Sequence(rdb->referenceSeqs[i].getName(), rdb->referenceSeqs[i].getAligned());
+                               
+                               if (count == 0) {  length = temp->getAligned().length();  count++;  } //gets first seqs length
+                               else if (length != temp->getAligned().length()) {       unaligned = true;       }
+                               
+                               if (temp->getName() != "") {  container.push_back(temp);  }
+                       }
+                       
+                       templateFileName = rdb->getSavedReference();
+                       
+               }else {
+                       
+                       m->mothurOut("Reading sequences from " + file + "..."); cout.flush();
+                       
+                       #ifdef USE_MPI
+                               int pid, processors;
+                               vector<unsigned long long> positions;
+                               int numSeqs;
+                               int tag = 2001;
+                               MPI_File inMPI;
+                               MPI_Status status; 
+                       
+                               if (byGroup) {
+                                       char inFileName[1024];
+                                       strcpy(inFileName, file.c_str());
+                                       
+                                       MPI_File_open(MPI_COMM_SELF, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+                                       
+                                       positions = m->setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs
+                                       
+                                       //read file 
+                                       for(int i=0;i<numSeqs;i++){
+                                               
+                                               if (m->control_pressed) { MPI_File_close(&inMPI); return container; }
+                                               
+                                               //read next sequence
+                                               int seqlength = positions[i+1] - positions[i];
+                                               char* buf4 = new char[seqlength];
+                                               
+                                               MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status);
+                                               
+                                               string tempBuf = buf4;
+                                               if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); }
+                                               delete buf4;
+                                               
+                                               istringstream iss (tempBuf,istringstream::in);
+                                               
+                                               Sequence* current = new Sequence(iss);   
+                                               if (current->getName() != "") {
+                                                       if (count == 0) {  length = current->getAligned().length();  count++;  } //gets first seqs length
+                                                       else if (length != current->getAligned().length()) {    unaligned = true;       }
+                                                       
+                                                       container.push_back(current);  
+                                                       if (rdb->save) { rdb->referenceSeqs.push_back(*current); }
+                                               }
+                                       }
+                                       
+                                       MPI_File_close(&inMPI);
+                                       
+                               }else {                                 
+                                       
+                                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+                                       MPI_Comm_size(MPI_COMM_WORLD, &processors);
+                                       
+                                       //char* inFileName = new char[file.length()];
+                                       //memcpy(inFileName, file.c_str(), file.length());
+                                       
+                                       char inFileName[1024];
+                                       strcpy(inFileName, file.c_str());
+                                       
+                                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+                                       //delete inFileName;
+                                       
+                                       if (pid == 0) {
+                                               positions = m->setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs
+                                               
+                                               //send file positions to all processes
+                                               for(int i = 1; i < processors; i++) { 
+                                                       MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+                                                       MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                                               }
+                                       }else{
+                                               MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+                                               positions.resize(numSeqs+1);
+                                               MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
+                                       }
+                                       
+                                       //read file 
+                                       for(int i=0;i<numSeqs;i++){
+                                               
+                                               if (m->control_pressed) { MPI_File_close(&inMPI); return container; }
+                                               
+                                               //read next sequence
+                                               int seqlength = positions[i+1] - positions[i];
+                                               char* buf4 = new char[seqlength];
+                                               
+                                               MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status);
+                                               
+                                               string tempBuf = buf4;
+                                               if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); }
+                                               delete buf4;
+                                               
+                                               istringstream iss (tempBuf,istringstream::in);
+                                               
+                                               Sequence* current = new Sequence(iss);   
+                                               if (current->getName() != "") {
+                                                       if (count == 0) {  length = current->getAligned().length();  count++;  } //gets first seqs length
+                                                       else if (length != current->getAligned().length()) {    unaligned = true;       }
+                                                       
+                                                       container.push_back(current);  
+                                                       if (rdb->save) { rdb->referenceSeqs.push_back(*current); }
+                                               }
+                                       }
+                                       
+                                       MPI_File_close(&inMPI);
+                                       MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+                               }
+               #else
+
+                       ifstream in;
+                       m->openInputFile(file, in);
+                       
+                       //read in seqs and store in vector
+                       while(!in.eof()){
+                               
+                               if (m->control_pressed) { return container; }
+                               
+                               Sequence* current = new Sequence(in);  m->gobble(in);
+                               
+                               if (count == 0) {  length = current->getAligned().length();  count++;  } //gets first seqs length
+                               else if (length != current->getAligned().length()) {   unaligned = true;        }
+                                                       
+                               if (current->getName() != "") {  
+                                       container.push_back(current);  
+                                       if (rdb->save) { rdb->referenceSeqs.push_back(*current); }
+                               }
+                       }
+                       in.close();
+               #endif
                
-               //read in seqs and store in vector
-               while(!in.eof()){
+                       m->mothurOut("Done."); m->mothurOutEndLine();
                        
-                       Sequence* current = new Sequence(in);
-                       container.push_back(current);
-                       gobble(in);
+                       filterString = (string(container[0]->getAligned().length(), '1'));
                }
                
-               in.close();
                return container;
        }
        catch(exception& e) {
-               errorOut(e, "Chimera", "readSeqs");
+               m->errorOut(e, "Chimera", "readSeqs");
                exit(1);
        }
 }
@@ -112,60 +270,85 @@ void Chimera::setMask(string filename) {
                }else if (filename == "") {  //do nothing 
                        seqMask = "";
                }else{
+               
+       #ifdef USE_MPI  
+                       MPI_File inMPI;
+                       MPI_Offset size;
+                       MPI_Status status;
+                       
+                       //char* inFileName = new char[filename.length()];
+                       //memcpy(inFileName, filename.c_str(), filename.length());
+                       
+                       char inFileName[1024];
+                       strcpy(inFileName, filename.c_str());
+       
+                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+                       MPI_File_get_size(inMPI, &size);
+
+                       //delete inFileName;
+                       
+                       char* buffer = new char[size];
+                       MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
+                       
+                       string tempBuf = buffer;
+                       if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
+                       istringstream iss (tempBuf,istringstream::in);
+
+                       delete buffer;
+                       
+                       if (!iss.eof()) {
+                               Sequence temp(iss);
+                               seqMask = temp.getAligned();
+                       }else {
+                               m->mothurOut("Problem with mask."); m->mothurOutEndLine(); 
+                               seqMask = "";
+                       }
+                       
+                       MPI_File_close(&inMPI);
+       #else
+       
                        ifstream infile;
-                       openInputFile(filename, infile);
+                       m->openInputFile(filename, infile);
                        
-                       while (!infile.eof()) {
+                       if (!infile.eof()) {
                                Sequence temp(infile);
                                seqMask = temp.getAligned();
-                               
-                               gobble(infile);
+                       }else {
+                               m->mothurOut("Problem with mask."); m->mothurOutEndLine(); 
+                               seqMask = "";
                        }
-                       
                        infile.close();
+       #endif
+       
                }
        }
        catch(exception& e) {
-               errorOut(e, "Chimera", "setMask");
+               m->errorOut(e, "Chimera", "setMask");
                exit(1);
        }
 }
 //***************************************************************************************************************
-
-vector< vector<float> > Chimera::readQuantiles() {
-       try {
-       
-               ifstream in;
-               openInputFile(quanfile, in);
-               
-               vector< vector<float> > quan;
-       
-               int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
+Sequence* Chimera::getSequence(string name) {
+       try{
+               Sequence* temp;
                
-               while(!in.eof()){
-                       
-                       in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
-                       
-                       vector <float> temp;
-                       
-                       temp.push_back(ten); 
-                       temp.push_back(twentyfive);
-                       temp.push_back(fifty);
-                       temp.push_back(seventyfive);
-                       temp.push_back(ninetyfive);
-                       temp.push_back(ninetynine);
-                       
-                       quan.push_back(temp);  
-       
-                       gobble(in);
+               //look through templateSeqs til you find it
+               int spot = -1;
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       if (name == templateSeqs[i]->getName()) {  
+                               spot = i;
+                               break;
+                       }
                }
                
-               in.close();
-               return quan;
+               if(spot == -1) { m->mothurOut("Error: Could not find sequence."); m->mothurOutEndLine(); return NULL; }
+               
+               temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
                
+               return temp;
        }
        catch(exception& e) {
-               errorOut(e, "Chimera", "readQuantiles");
+               m->errorOut(e, "Chimera", "getSequence");
                exit(1);
        }
 }