]> git.donarmstrong.com Git - mothur.git/blob - sequenceparser.cpp
working on windows paralellization, added trimOligos class to be used by trim.flows...
[mothur.git] / sequenceparser.cpp
1 /*
2  *  sequenceParser.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/9/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "sequenceParser.h"
11
12
13 /************************************************************/
14 SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
15         try {
16                 
17                 m = MothurOut::getInstance();
18                 int error;
19                 
20                 //read group file
21                 groupMap = new GroupMap(groupFile);
22                 error = groupMap->readMap();
23                 
24                 if (error == 1) { m->control_pressed = true; }
25                 
26                 //initialize maps
27                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
28                 for (int i = 0; i < namesOfGroups.size(); i++) {
29                         vector<Sequence> temp;
30                         map<string, string> tempMap;
31                         seqs[namesOfGroups[i]] = temp;
32                         nameMapPerGroup[namesOfGroups[i]] = tempMap;
33                 }
34                 
35                 //read fasta file making sure each sequence is in the group file
36                 ifstream in;
37                 m->openInputFile(fastaFile, in);
38                 
39                 map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
40                 while (!in.eof()) {
41                         
42                         if (m->control_pressed) { break; }
43                         
44                         Sequence seq(in); m->gobble(in);
45                         
46                         if (seq.getName() != "") {
47                                 
48                                  string group = groupMap->getGroup(seq.getName());
49                                  if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
50                                  else { 
51                                          seqs[group].push_back(seq);    
52                                          seqName[seq.getName()] = seq.getAligned();
53                                  }
54                         }
55                 }
56                 in.close();
57                                  
58                 if (error == 1) { m->control_pressed = true; }
59                                  
60                 //read name file
61                 ifstream inName;
62                 m->openInputFile(nameFile, inName);
63                 
64                 string first, second;
65                 int countName = 0;
66                 while(!inName.eof()) {
67                         
68                         if (m->control_pressed) { break; }
69                         
70                         inName >> first; m->gobble(inName);
71                         inName >> second; m->gobble(inName);
72                         
73                         vector<string> names;
74                         m->splitAtChar(second, names, ',');
75                         
76                         //get aligned string for these seqs from the fasta file
77                         string alignedString = "";
78                         map<string, string>::iterator itAligned = seqName.find(names[0]);
79                         if (itAligned == seqName.end()) {
80                                 error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
81                         }else {
82                                 alignedString = itAligned->second;
83                         }
84                         
85                         //separate by group - parse one line in name file
86                         map<string, string> splitMap; //group -> name1,name2,...
87                         map<string, string>::iterator it;
88                         for (int i = 0; i < names.size(); i++) {
89                                 
90                                 string group = groupMap->getGroup(names[i]);
91                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
92                                 else {  
93                                         
94                                         it = splitMap.find(group);
95                                         if (it != splitMap.end()) { //adding seqs to this group
96                                                 (it->second) += "," + names[i];
97                                                 countName++;
98                                         }else { //first sighting of this group
99                                                 splitMap[group] = names[i];
100                                                 countName++;
101                                                 
102                                                 //is this seq in the fasta file?
103                                                 if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
104                                                         Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
105                                                         seqs[group].push_back(tempSeq);
106                                                 }
107                                         }
108                                 }
109                         }
110                         
111                         
112                         //fill nameMapPerGroup - holds all lines in namefile separated by group
113                         for (it = splitMap.begin(); it != splitMap.end(); it++) {
114                                 //grab first name
115                                 string firstName = "";
116                                 for(int i = 0; i < (it->second).length(); i++) {
117                                         if (((it->second)[i]) != ',') {
118                                                 firstName += ((it->second)[i]);
119                                         }else { break; }
120                                 }
121                                 
122                                 //group1 -> seq1 -> seq1,seq2,seq3
123                                 nameMapPerGroup[it->first][firstName] = it->second;
124                         }
125                 }
126                 
127                 inName.close();
128                 
129                 if (error == 1) { m->control_pressed = true; }
130                 
131                 if (countName != (groupMap->getNumSeqs())) {
132                         m->mothurOutEndLine();
133                         m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
134                         m->mothurOutEndLine();
135                         m->control_pressed = true;
136                 }
137                 
138         }
139         catch(exception& e) {
140                 m->errorOut(e, "SequenceParser", "SequenceParser");
141                 exit(1);
142         }
143 }
144 /************************************************************/
145 SequenceParser::SequenceParser(string groupFile, string fastaFile) {
146         try {
147                 
148                 m = MothurOut::getInstance();
149                 int error;
150                 
151                 //read group file
152                 groupMap = new GroupMap(groupFile);
153                 error = groupMap->readMap();
154                 
155                 if (error == 1) { m->control_pressed = true; }
156                 
157                 //initialize maps
158                 vector<string> namesOfGroups = groupMap->getNamesOfGroups();
159                 for (int i = 0; i < namesOfGroups.size(); i++) {
160                         vector<Sequence> temp;
161                         seqs[namesOfGroups[i]] = temp;
162                 }
163                 
164                 //read fasta file making sure each sequence is in the group file
165                 ifstream in;
166                 m->openInputFile(fastaFile, in);
167                 int count = 0;
168                 
169                 while (!in.eof()) {
170                         
171                         if (m->control_pressed) { break; }
172                         
173                         Sequence seq(in); m->gobble(in);
174                         
175                         if (seq.getName() != "") {
176                                 
177                                 string group = groupMap->getGroup(seq.getName());
178                                 if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
179                                 else {  seqs[group].push_back(seq);     count++; }
180                         }
181                 }
182                 in.close();
183                 
184                 if (error == 1) { m->control_pressed = true; }
185                 
186                 if (count != (groupMap->getNumSeqs())) {
187                         m->mothurOutEndLine();
188                         m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
189                         if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
190                         m->mothurOutEndLine();
191                         m->control_pressed = true;
192                 }
193                 
194         }
195         catch(exception& e) {
196                 m->errorOut(e, "SequenceParser", "SequenceParser");
197                 exit(1);
198         }
199 }
200 /************************************************************/
201 SequenceParser::~SequenceParser(){ delete groupMap; }
202 /************************************************************/
203 int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
204 /************************************************************/
205 vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
206 /************************************************************/
207 bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
208 /************************************************************/
209 string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
210 /************************************************************/
211 int SequenceParser::getNumSeqs(string g){ 
212         try {
213                 map<string, vector<Sequence> >::iterator it;
214                 int num = 0;
215                 
216                 it = seqs.find(g);
217                 if(it == seqs.end()) {
218                         m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
219                 }else {
220                         num = (it->second).size();
221                 }
222                 
223                 return num; 
224         }
225         catch(exception& e) {
226                 m->errorOut(e, "SequenceParser", "getNumSeqs");
227                 exit(1);
228         }
229 }
230 /************************************************************/
231 vector<Sequence> SequenceParser::getSeqs(string g){ 
232         try {
233                 map<string, vector<Sequence> >::iterator it;
234                 vector<Sequence> seqForThisGroup;
235                 
236                 it = seqs.find(g);
237                 if(it == seqs.end()) {
238                         m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
239                 }else {
240                         seqForThisGroup = it->second;
241                 }
242                 
243                 return seqForThisGroup; 
244         }
245         catch(exception& e) {
246                 m->errorOut(e, "SequenceParser", "getSeqs");
247                 exit(1);
248         }
249 }
250 /************************************************************/
251 map<string, string> SequenceParser::getNameMap(string g){ 
252         try {
253                 map<string, map<string, string> >::iterator it;
254                 map<string, string> nameMapForThisGroup;
255                 
256                 it = nameMapPerGroup.find(g);
257                 if(it == nameMapPerGroup.end()) {
258                         m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
259                 }else {
260                         nameMapForThisGroup = it->second;
261                 }
262                 
263                 return nameMapForThisGroup; 
264         }
265         catch(exception& e) {
266                 m->errorOut(e, "SequenceParser", "getNameMap");
267                 exit(1);
268         }
269 }
270 /************************************************************/
271
272
273