From: Don Armstrong <don@donarmstrong.com>
Date: Fri, 11 Apr 2014 18:37:19 +0000 (-0700)
Subject: use hash and file position in deconvolute
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=5bbfddbc952ed490d5cce7f252796ffded21fca1;p=mothur.git

use hash and file position in deconvolute
---

diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp
index ec80973..d6c5a5a 100644
--- a/deconvolutecommand.cpp
+++ b/deconvolutecommand.cpp
@@ -9,6 +9,10 @@
 
 #include "deconvolutecommand.h"
 #include "sequence.hpp"
+#include <unordered_map>
+#include <unordered_set>
+#include <streambuf>
+
 
 //**********************************************************************************************************************
 vector<string> DeconvoluteCommand::setParameters(){	
@@ -226,16 +230,15 @@ int DeconvoluteCommand::execute() {
 		ofstream outFasta;
 		m->openOutputFile(outFastaFile, outFasta);
 		
-		map<string, string> sequenceStrings; //sequenceString -> list of names.  "atgc...." -> seq1,seq2,seq3.
-		map<string, string>::iterator itStrings;
-		set<string> nameInFastaFile; //for sanity checking
-		set<string>::iterator itname;
-		vector<string> nameFileOrder;
+        hash<string> hash_fn;
+		multimap<size_t,pair<streampos,string>> sequenceStrings; //sequenceString -> list of names.  "atgc...." -> seq1,seq2,seq3.
+		unordered_set<string> nameInFastaFile; //for sanity checking
+        unordered_set<string>::iterator itname;
 		int count = 0;
 		while (!in.eof()) {
 			
 			if (m->control_pressed) { in.close(); outFasta.close(); m->mothurRemove(outFastaFile); return 0; }
-			
+			streampos seq_start = in.tellg();
 			Sequence seq(in);
 			
 			if (seq.getName() != "") {
@@ -245,9 +248,8 @@ int DeconvoluteCommand::execute() {
 				if (itname == nameInFastaFile.end()) { nameInFastaFile.insert(seq.getName());  }
 				else { m->mothurOut("[ERROR]: You already have a sequence named " + seq.getName() + " in your fasta file, sequence names must be unique, please correct."); m->mothurOutEndLine(); }
 
-				itStrings = sequenceStrings.find(seq.getAligned());
-				
-				if (itStrings == sequenceStrings.end()) { //this is a new unique sequence
+                auto itStrings = in_fasta_from_hash_map(sequenceStrings,seq,in);
+                if (itStrings == sequenceStrings.end()) { //this is a new unique sequence
 					//output to unique fasta file
 					seq.printSequence(outFasta);
 					
@@ -255,30 +257,33 @@ int DeconvoluteCommand::execute() {
 						itNames = nameMap.find(seq.getName());
 						
 						if (itNames == nameMap.end()) { //namefile and fastafile do not match
-							m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine();
+							m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct.");
+                            m->mothurOutEndLine();
 						}else {
-							sequenceStrings[seq.getAligned()] = itNames->second;
-							nameFileOrder.push_back(seq.getAligned());
+                          sequenceStrings.emplace(hash_fn(seq.getAligned()),
+                                                  make_pair(seq_start,seq.getName()));
 						}
-					}else if (countfile != "") { 
+					}else {
+                      if (countfile != "") { 
                         ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table
-                        sequenceStrings[seq.getAligned()] = seq.getName();	nameFileOrder.push_back(seq.getAligned());
-                    }else {	sequenceStrings[seq.getAligned()] = seq.getName();	nameFileOrder.push_back(seq.getAligned()); }
+                      }
+                      sequenceStrings.emplace(hash_fn(seq.getAligned()),
+                                              make_pair(seq_start,
+                                                        seq.getName())
+                                              );
+                    }
 				}else { //this is a dup
-					if (oldNameMapFName != "") {
-						itNames = nameMap.find(seq.getName());
-						
-						if (itNames == nameMap.end()) { //namefile and fastafile do not match
-							m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine();
-						}else {
-							sequenceStrings[seq.getAligned()] += "," + itNames->second;
-						}
-                    }else if (countfile != "") { 
+					if (oldNameMapFName != "" && 
+                        nameMap.find(seq.getName()) == nameMap.end()) { //namefile and fastafile do not match
+                      m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct.");
+                      m->mothurOutEndLine();
+                    } else if (countfile != "") { 
                         int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table
                         if (num != 0) { //its in the table
-                            ct.mergeCounts(itStrings->second, seq.getName()); //merges counts and saves in uniques name
+                            ct.mergeCounts(itStrings->second.second, seq.getName()); //merges counts and saves in uniques name
                         }
-                    }else {	sequenceStrings[seq.getAligned()] += "," + seq.getName();	}
+                    }
+                    itStrings->second.second += "," + seq.getName();
 				}
 				
 				count++;
@@ -301,23 +306,19 @@ int DeconvoluteCommand::execute() {
 		if (countfile == "") { m->openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile);  }
         else { m->openOutputFile(outCountFile, outNames); ct.printHeaders(outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); }
 		
-		for (int i = 0; i < nameFileOrder.size(); i++) {
+		for (auto it = sequenceStrings.begin(); it != sequenceStrings.end(); it++) {
 			if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
 			
-			itStrings = sequenceStrings.find(nameFileOrder[i]);
-			
-			if (itStrings != sequenceStrings.end()) {
-                if (countfile == "") {
-                    //get rep name
-                    int pos = (itStrings->second).find_first_of(',');
-                    
-                    if (pos == string::npos) { // only reps itself
-                        outNames << itStrings->second << '\t' << itStrings->second << endl;
-                    }else {
-                        outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl;
-                    }
-                }else {  ct.printSeq(outNames, itStrings->second);  }
-			}else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; }
+            if (countfile == "") {
+              //get rep name
+              int pos = (it->second.second).find_first_of(',');
+              
+              if (pos == string::npos) { // only reps itself
+                outNames << it->second.second << '\t' << it->second.second << endl;
+              }else {
+                outNames << (it->second.second).substr(0, pos) << '\t' << it->second.second << endl;
+              }
+            }else {  ct.printSeq(outNames, it->second.second);  }
 		}
 		outNames.close();
 		
@@ -354,3 +355,31 @@ int DeconvoluteCommand::execute() {
 	}
 }
 /**************************************************************************************/
+
+multimap<size_t,std::pair<std::streampos,string>>::iterator
+  in_fasta_from_hash_map(multimap<size_t,std::pair<std::streampos,string>>& seq_strings,
+                            Sequence& seq,
+                            ifstream& fasta_file) {
+  streampos cur_pos = fasta_file.tellg();
+  const std::hash<std::string> hash_fn;
+  auto it_range = seq_strings.equal_range(hash_fn(seq.getAligned()));
+  // If the hash of the aligned sequence is not in the map, it can't
+  // possibly be a duplicate
+  if (it_range.first == seq_strings.end()) {
+    return seq_strings.end();
+  }
+  // If it is, it might be a hash collision. Read the file to
+  // determine if that's the case
+  for (auto it = it_range.first; it != it_range.second; it++) {
+    streampos pos = it->second.first;
+    fasta_file.seekg(pos);
+    // read in sequence
+    Sequence old_seq(fasta_file);
+    if (old_seq.getAligned() == seq.getAligned()) {
+      fasta_file.seekg(cur_pos);
+      return it;
+    }
+  }
+  fasta_file.seekg(cur_pos);
+  return seq_strings.end();
+}
diff --git a/deconvolutecommand.h b/deconvolutecommand.h
index e514bda..3634306 100644
--- a/deconvolutecommand.h
+++ b/deconvolutecommand.h
@@ -12,6 +12,7 @@
 #include "command.hpp"
 #include "fastamap.h"
 #include "counttable.h"
+#include "sequence.hpp"
 
 /* The unique.seqs command reads a fasta file, finds the duplicate sequences and outputs a names file
 	containing 2 columns.  The first being the groupname and the second the list of identical sequence names. */ 
@@ -45,4 +46,9 @@ private:
 	bool abort;
 };
 
+multimap<size_t,std::pair<std::streampos,string>>::iterator
+  in_fasta_from_hash_map(multimap<size_t,pair<streampos,string>>&,
+                            Sequence&,
+                            ifstream&);
+
 #endif