X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=sequenceparser.h;fp=sequenceparser.h;h=23fcb9ecc983e1cf59ef1fde1f976da80773a152;hb=0caf3fbabaa3ece404f8ce77f4c883dc5b1bf1dc;hp=0000000000000000000000000000000000000000;hpb=1b73ff67c83892a025e597dabd9df6fe7b58206a;p=mothur.git diff --git a/sequenceparser.h b/sequenceparser.h new file mode 100644 index 0000000..23fcb9e --- /dev/null +++ b/sequenceparser.h @@ -0,0 +1,61 @@ +#ifndef SEQUENCEPARSER_H +#define SEQUENCEPARSER_H + +/* + * sequenceParser.h + * Mothur + * + * Created by westcott on 9/9/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + + +#include "mothur.h" +#include "mothurout.h" +#include "sequence.hpp" +#include "groupmap.h" + +/* This class reads a fasta and group file with a namesfile as optional and parses the data by group. + + Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences. + This is because when we parse the name file we make a unique for each group instead of 1 unique for all + groups. + + */ + +class SequenceParser { + + public: + + SequenceParser(string, string); //group, fasta - file mismatches will set m->control_pressed = true + SequenceParser(string, string, string); //group, fasta, name - file mismatches will set m->control_pressed = true + ~SequenceParser(); + + //general operations + int getNumGroups(); + vector getNamesOfGroups(); + bool isValidGroup(string); //return true if string is a valid group + string getGroup(string); //returns group of a specific sequence + + int getNumSeqs(string); //returns the number of unique sequences in a specific group + vector getSeqs(string); //returns unique sequences in a specific group + map getNameMap(string); //returns seqName -> namesOfRedundantSeqs separated by commas for a specific group - the name file format, but each line is parsed by group. + + int getSeqs(string, string, bool); //prints unique sequences in a specific group to a file - group, filename, uchimeFormat=false + int getNameMap(string, string); //print seqName -> namesOfRedundantSeqs separated by commas for a specific group - group, filename + + map getAllSeqsMap(){ return allSeqsMap; } //returns map where the key=sequenceName and the value=representativeSequence - helps us remove duplicates after group by group processing + private: + + GroupMap* groupMap; + MothurOut* m; + + int numSeqs; + map allSeqsMap; + map > seqs; //a vector for each group + map > nameMapPerGroup; //nameMap for each group +}; + +#endif +