]> git.donarmstrong.com Git - mothur.git/commitdiff
use hash and file position in deconvolute
authorDon Armstrong <don@donarmstrong.com>
Fri, 11 Apr 2014 18:37:19 +0000 (11:37 -0700)
committerDon Armstrong <don@donarmstrong.com>
Fri, 11 Apr 2014 18:37:19 +0000 (11:37 -0700)
deconvolutecommand.cpp
deconvolutecommand.h

index ec809730a036245f34dfc2bc0331b108c5d10b38..d6c5a5ae0cabf761aa300dab0e241daf6c04c118 100644 (file)
@@ -9,6 +9,10 @@
 
 #include "deconvolutecommand.h"
 #include "sequence.hpp"
+#include <unordered_map>
+#include <unordered_set>
+#include <streambuf>
+
 
 //**********************************************************************************************************************
 vector<string> DeconvoluteCommand::setParameters(){    
@@ -226,16 +230,15 @@ int DeconvoluteCommand::execute() {
                ofstream outFasta;
                m->openOutputFile(outFastaFile, outFasta);
                
-               map<string, string> sequenceStrings; //sequenceString -> list of names.  "atgc...." -> seq1,seq2,seq3.
-               map<string, string>::iterator itStrings;
-               set<string> nameInFastaFile; //for sanity checking
-               set<string>::iterator itname;
-               vector<string> nameFileOrder;
+        hash<string> hash_fn;
+               multimap<size_t,pair<streampos,string>> sequenceStrings; //sequenceString -> list of names.  "atgc...." -> seq1,seq2,seq3.
+               unordered_set<string> nameInFastaFile; //for sanity checking
+        unordered_set<string>::iterator itname;
                int count = 0;
                while (!in.eof()) {
                        
                        if (m->control_pressed) { in.close(); outFasta.close(); m->mothurRemove(outFastaFile); return 0; }
-                       
+                       streampos seq_start = in.tellg();
                        Sequence seq(in);
                        
                        if (seq.getName() != "") {
@@ -245,9 +248,8 @@ int DeconvoluteCommand::execute() {
                                if (itname == nameInFastaFile.end()) { nameInFastaFile.insert(seq.getName());  }
                                else { m->mothurOut("[ERROR]: You already have a sequence named " + seq.getName() + " in your fasta file, sequence names must be unique, please correct."); m->mothurOutEndLine(); }
 
-                               itStrings = sequenceStrings.find(seq.getAligned());
-                               
-                               if (itStrings == sequenceStrings.end()) { //this is a new unique sequence
+                auto itStrings = in_fasta_from_hash_map(sequenceStrings,seq,in);
+                if (itStrings == sequenceStrings.end()) { //this is a new unique sequence
                                        //output to unique fasta file
                                        seq.printSequence(outFasta);
                                        
@@ -255,30 +257,33 @@ int DeconvoluteCommand::execute() {
                                                itNames = nameMap.find(seq.getName());
                                                
                                                if (itNames == nameMap.end()) { //namefile and fastafile do not match
-                                                       m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine();
+                                                       m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct.");
+                            m->mothurOutEndLine();
                                                }else {
-                                                       sequenceStrings[seq.getAligned()] = itNames->second;
-                                                       nameFileOrder.push_back(seq.getAligned());
+                          sequenceStrings.emplace(hash_fn(seq.getAligned()),
+                                                  make_pair(seq_start,seq.getName()));
                                                }
-                                       }else if (countfile != "") { 
+                                       }else {
+                      if (countfile != "") { 
                         ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table
-                        sequenceStrings[seq.getAligned()] = seq.getName();     nameFileOrder.push_back(seq.getAligned());
-                    }else {    sequenceStrings[seq.getAligned()] = seq.getName();      nameFileOrder.push_back(seq.getAligned()); }
+                      }
+                      sequenceStrings.emplace(hash_fn(seq.getAligned()),
+                                              make_pair(seq_start,
+                                                        seq.getName())
+                                              );
+                    }
                                }else { //this is a dup
-                                       if (oldNameMapFName != "") {
-                                               itNames = nameMap.find(seq.getName());
-                                               
-                                               if (itNames == nameMap.end()) { //namefile and fastafile do not match
-                                                       m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine();
-                                               }else {
-                                                       sequenceStrings[seq.getAligned()] += "," + itNames->second;
-                                               }
-                    }else if (countfile != "") { 
+                                       if (oldNameMapFName != "" && 
+                        nameMap.find(seq.getName()) == nameMap.end()) { //namefile and fastafile do not match
+                      m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct.");
+                      m->mothurOutEndLine();
+                    } else if (countfile != "") { 
                         int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table
                         if (num != 0) { //its in the table
-                            ct.mergeCounts(itStrings->second, seq.getName()); //merges counts and saves in uniques name
+                            ct.mergeCounts(itStrings->second.second, seq.getName()); //merges counts and saves in uniques name
                         }
-                    }else {    sequenceStrings[seq.getAligned()] += "," + seq.getName();       }
+                    }
+                    itStrings->second.second += "," + seq.getName();
                                }
                                
                                count++;
@@ -301,23 +306,19 @@ int DeconvoluteCommand::execute() {
                if (countfile == "") { m->openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile);  }
         else { m->openOutputFile(outCountFile, outNames); ct.printHeaders(outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); }
                
-               for (int i = 0; i < nameFileOrder.size(); i++) {
+               for (auto it = sequenceStrings.begin(); it != sequenceStrings.end(); it++) {
                        if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
                        
-                       itStrings = sequenceStrings.find(nameFileOrder[i]);
-                       
-                       if (itStrings != sequenceStrings.end()) {
-                if (countfile == "") {
-                    //get rep name
-                    int pos = (itStrings->second).find_first_of(',');
-                    
-                    if (pos == string::npos) { // only reps itself
-                        outNames << itStrings->second << '\t' << itStrings->second << endl;
-                    }else {
-                        outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl;
-                    }
-                }else {  ct.printSeq(outNames, itStrings->second);  }
-                       }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; }
+            if (countfile == "") {
+              //get rep name
+              int pos = (it->second.second).find_first_of(',');
+              
+              if (pos == string::npos) { // only reps itself
+                outNames << it->second.second << '\t' << it->second.second << endl;
+              }else {
+                outNames << (it->second.second).substr(0, pos) << '\t' << it->second.second << endl;
+              }
+            }else {  ct.printSeq(outNames, it->second.second);  }
                }
                outNames.close();
                
@@ -354,3 +355,31 @@ int DeconvoluteCommand::execute() {
        }
 }
 /**************************************************************************************/
+
+multimap<size_t,std::pair<std::streampos,string>>::iterator
+  in_fasta_from_hash_map(multimap<size_t,std::pair<std::streampos,string>>& seq_strings,
+                            Sequence& seq,
+                            ifstream& fasta_file) {
+  streampos cur_pos = fasta_file.tellg();
+  const std::hash<std::string> hash_fn;
+  auto it_range = seq_strings.equal_range(hash_fn(seq.getAligned()));
+  // If the hash of the aligned sequence is not in the map, it can't
+  // possibly be a duplicate
+  if (it_range.first == seq_strings.end()) {
+    return seq_strings.end();
+  }
+  // If it is, it might be a hash collision. Read the file to
+  // determine if that's the case
+  for (auto it = it_range.first; it != it_range.second; it++) {
+    streampos pos = it->second.first;
+    fasta_file.seekg(pos);
+    // read in sequence
+    Sequence old_seq(fasta_file);
+    if (old_seq.getAligned() == seq.getAligned()) {
+      fasta_file.seekg(cur_pos);
+      return it;
+    }
+  }
+  fasta_file.seekg(cur_pos);
+  return seq_strings.end();
+}
index e514bdae70f7e0b51854b40baacf44d2b0174c8a..36343064db2879e690ada12d97b04cfa8f8b7c8e 100644 (file)
@@ -12,6 +12,7 @@
 #include "command.hpp"
 #include "fastamap.h"
 #include "counttable.h"
+#include "sequence.hpp"
 
 /* The unique.seqs command reads a fasta file, finds the duplicate sequences and outputs a names file
        containing 2 columns.  The first being the groupname and the second the list of identical sequence names. */ 
@@ -45,4 +46,9 @@ private:
        bool abort;
 };
 
+multimap<size_t,std::pair<std::streampos,string>>::iterator
+  in_fasta_from_hash_map(multimap<size_t,pair<streampos,string>>&,
+                            Sequence&,
+                            ifstream&);
+
 #endif