From: Don Armstrong Date: Fri, 11 Apr 2014 18:37:19 +0000 (-0700) Subject: use hash and file position in deconvolute X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=5bbfddbc952ed490d5cce7f252796ffded21fca1;p=mothur.git use hash and file position in deconvolute --- diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index ec80973..d6c5a5a 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -9,6 +9,10 @@ #include "deconvolutecommand.h" #include "sequence.hpp" +#include +#include +#include + //********************************************************************************************************************** vector DeconvoluteCommand::setParameters(){ @@ -226,16 +230,15 @@ int DeconvoluteCommand::execute() { ofstream outFasta; m->openOutputFile(outFastaFile, outFasta); - map sequenceStrings; //sequenceString -> list of names. "atgc...." -> seq1,seq2,seq3. - map::iterator itStrings; - set nameInFastaFile; //for sanity checking - set::iterator itname; - vector nameFileOrder; + hash hash_fn; + multimap> sequenceStrings; //sequenceString -> list of names. "atgc...." -> seq1,seq2,seq3. + unordered_set nameInFastaFile; //for sanity checking + unordered_set::iterator itname; int count = 0; while (!in.eof()) { if (m->control_pressed) { in.close(); outFasta.close(); m->mothurRemove(outFastaFile); return 0; } - + streampos seq_start = in.tellg(); Sequence seq(in); if (seq.getName() != "") { @@ -245,9 +248,8 @@ int DeconvoluteCommand::execute() { if (itname == nameInFastaFile.end()) { nameInFastaFile.insert(seq.getName()); } else { m->mothurOut("[ERROR]: You already have a sequence named " + seq.getName() + " in your fasta file, sequence names must be unique, please correct."); m->mothurOutEndLine(); } - itStrings = sequenceStrings.find(seq.getAligned()); - - if (itStrings == sequenceStrings.end()) { //this is a new unique sequence + auto itStrings = in_fasta_from_hash_map(sequenceStrings,seq,in); + if (itStrings == sequenceStrings.end()) { //this is a new unique sequence //output to unique fasta file seq.printSequence(outFasta); @@ -255,30 +257,33 @@ int DeconvoluteCommand::execute() { itNames = nameMap.find(seq.getName()); if (itNames == nameMap.end()) { //namefile and fastafile do not match - m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); + m->mothurOutEndLine(); }else { - sequenceStrings[seq.getAligned()] = itNames->second; - nameFileOrder.push_back(seq.getAligned()); + sequenceStrings.emplace(hash_fn(seq.getAligned()), + make_pair(seq_start,seq.getName())); } - }else if (countfile != "") { + }else { + if (countfile != "") { ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table - sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); - }else { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } + } + sequenceStrings.emplace(hash_fn(seq.getAligned()), + make_pair(seq_start, + seq.getName()) + ); + } }else { //this is a dup - if (oldNameMapFName != "") { - itNames = nameMap.find(seq.getName()); - - if (itNames == nameMap.end()) { //namefile and fastafile do not match - m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); - }else { - sequenceStrings[seq.getAligned()] += "," + itNames->second; - } - }else if (countfile != "") { + if (oldNameMapFName != "" && + nameMap.find(seq.getName()) == nameMap.end()) { //namefile and fastafile do not match + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); + m->mothurOutEndLine(); + } else if (countfile != "") { int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table if (num != 0) { //its in the table - ct.mergeCounts(itStrings->second, seq.getName()); //merges counts and saves in uniques name + ct.mergeCounts(itStrings->second.second, seq.getName()); //merges counts and saves in uniques name } - }else { sequenceStrings[seq.getAligned()] += "," + seq.getName(); } + } + itStrings->second.second += "," + seq.getName(); } count++; @@ -301,23 +306,19 @@ int DeconvoluteCommand::execute() { if (countfile == "") { m->openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile); } else { m->openOutputFile(outCountFile, outNames); ct.printHeaders(outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); } - for (int i = 0; i < nameFileOrder.size(); i++) { + for (auto it = sequenceStrings.begin(); it != sequenceStrings.end(); it++) { if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - itStrings = sequenceStrings.find(nameFileOrder[i]); - - if (itStrings != sequenceStrings.end()) { - if (countfile == "") { - //get rep name - int pos = (itStrings->second).find_first_of(','); - - if (pos == string::npos) { // only reps itself - outNames << itStrings->second << '\t' << itStrings->second << endl; - }else { - outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; - } - }else { ct.printSeq(outNames, itStrings->second); } - }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; } + if (countfile == "") { + //get rep name + int pos = (it->second.second).find_first_of(','); + + if (pos == string::npos) { // only reps itself + outNames << it->second.second << '\t' << it->second.second << endl; + }else { + outNames << (it->second.second).substr(0, pos) << '\t' << it->second.second << endl; + } + }else { ct.printSeq(outNames, it->second.second); } } outNames.close(); @@ -354,3 +355,31 @@ int DeconvoluteCommand::execute() { } } /**************************************************************************************/ + +multimap>::iterator + in_fasta_from_hash_map(multimap>& seq_strings, + Sequence& seq, + ifstream& fasta_file) { + streampos cur_pos = fasta_file.tellg(); + const std::hash hash_fn; + auto it_range = seq_strings.equal_range(hash_fn(seq.getAligned())); + // If the hash of the aligned sequence is not in the map, it can't + // possibly be a duplicate + if (it_range.first == seq_strings.end()) { + return seq_strings.end(); + } + // If it is, it might be a hash collision. Read the file to + // determine if that's the case + for (auto it = it_range.first; it != it_range.second; it++) { + streampos pos = it->second.first; + fasta_file.seekg(pos); + // read in sequence + Sequence old_seq(fasta_file); + if (old_seq.getAligned() == seq.getAligned()) { + fasta_file.seekg(cur_pos); + return it; + } + } + fasta_file.seekg(cur_pos); + return seq_strings.end(); +} diff --git a/deconvolutecommand.h b/deconvolutecommand.h index e514bda..3634306 100644 --- a/deconvolutecommand.h +++ b/deconvolutecommand.h @@ -12,6 +12,7 @@ #include "command.hpp" #include "fastamap.h" #include "counttable.h" +#include "sequence.hpp" /* The unique.seqs command reads a fasta file, finds the duplicate sequences and outputs a names file containing 2 columns. The first being the groupname and the second the list of identical sequence names. */ @@ -45,4 +46,9 @@ private: bool abort; }; +multimap>::iterator + in_fasta_from_hash_map(multimap>&, + Sequence&, + ifstream&); + #endif