1 #ifndef SEQUENCEPARSER_H
2 #define SEQUENCEPARSER_H
8 * Created by westcott on 9/9/11.
9 * Copyright 2011 Schloss Lab. All rights reserved.
15 #include "mothurout.h"
16 #include "sequence.hpp"
19 /* This class reads a fasta file and a group file, with an optional names file, and parses the data by group.
21 Note: The sum of all the groups' unique sequences will be larger than the original number of unique sequences.
22 This is because when we parse the name file we create a unique sequence for each group instead of one unique sequence for all
27 class SequenceParser {
// Two-file constructor. Arguments, in order: group file, fasta file.
// If the two files do not match each other, m->control_pressed is set to true.
31 SequenceParser(string, string); //group, fasta
// Three-file constructor. Arguments, in order: group file, fasta file, name file.
// If the files do not match each other, m->control_pressed is set to true.
32 SequenceParser(string, string, string); //group, fasta, name
// Returns the group names as a vector of strings.
// NOTE(review): presumably these are the groups read from the group file - confirm against the implementation.
37 vector<string> getNamesOfGroups();
// Returns true if the given string is a valid group, false otherwise.
38 bool isValidGroup(string); //argument: the group to check
// Returns the number of unique sequences in a specific group.
40 int getNumSeqs(string); //argument: the group
// Returns the unique sequences in a specific group.
// The vector is returned by value.
41 vector<Sequence> getSeqs(string); //argument: the group
// Returns the name map for a specific group: seqName -> namesOfRedundantSeqs separated by commas.
// This is the name file format, but each line is parsed by group.
42 map<string, string> getNameMap(string); //argument: the group
// Prints the unique sequences in a specific group to a file.
// Arguments, in order: group, filename, uchimeFormat.
// The original note reads "uchimeFormat=false", but no default argument is declared
// here, so callers must pass all three arguments.
// NOTE(review): the meaning of the int return value is not visible in this header - confirm in the implementation.
44 int getSeqs(string, string, bool); //group, filename, uchimeFormat
// Prints the name map for a specific group to a file:
// seqName -> namesOfRedundantSeqs separated by commas.
// Arguments, in order: group, filename.
// NOTE(review): the meaning of the int return value is not visible in this header - confirm in the implementation.
45 int getNameMap(string, string); //group, filename
// Returns allSeqsMap by value, so the caller receives its own copy of the map.
// Key = sequenceName, value = representativeSequence.
// Helps us remove duplicates after group-by-group processing.
47 map<string, string> getAllSeqsMap(){ return allSeqsMap; }
// Map handed out by getAllSeqsMap(): key = sequenceName, value = representativeSequence.
54 map<string, string> allSeqsMap;
// One vector of Sequence objects for each group.
// NOTE(review): the map key is presumably the group name - confirm in the implementation.
55 map<string, vector<Sequence> > seqs; //a vector for each group
// One name map for each group.
// NOTE(review): the outer key is presumably the group name and the inner map is
// seqName -> comma-separated redundant names, as returned by getNameMap(string) - confirm.
56 map<string, map<string, string> > nameMapPerGroup; //nameMap for each group