]> git.donarmstrong.com Git - mothur.git/blob - getrepseqscommand.cpp
added get.repseqs command, started matrix output command
[mothur.git] / getrepseqscommand.cpp
1 /*
2  *  getrepseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 5/19/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getrepseqscommand.h"
11
12 //**********************************************************************************************************************
13 GetRepSeqsCommand::GetRepSeqsCommand(){
14         try {
15                 globaldata = GlobalData::getInstance();
16                 fastafile = globaldata->getFastaFile();
17                 namesfile = globaldata->getNameFile();
18                 openInputFile(fastafile, in);
19                 
20                 fasta = new FastaMap();
21                 
22                 //read in group map info.
23                 groupMap = new GroupMap(globaldata->getGroupFile());
24                 groupMap->readMap();
25                         
26                 //fill filehandles with neccessary ofstreams
27                 int i;
28                 ofstream* temp;
29                 //one for each group
30                 for (i=0; i<groupMap->getNumGroups(); i++) {
31                         temp = new ofstream;
32                         filehandles[groupMap->namesOfGroups[i]] = temp;
33                 }
34                 
35                 //one for shared
36                 temp = new ofstream;
37                 string s = "shared";
38                 filehandles[s] = temp;
39                 
40         }
41         catch(exception& e) {
42                 cout << "Standard Error: " << e.what() << " has occurred in the GetRepSeqsCommand class Function GetRepSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
43                 exit(1);
44         }
45         catch(...) {
46                 cout << "An unknown error has occurred in the GetRepSeqsCommand class function GetRepSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
47                 exit(1);
48         }       
49 }
50
51 //**********************************************************************************************************************
52
53 GetRepSeqsCommand::~GetRepSeqsCommand(){
54         delete input;
55         delete read;
56         delete fasta;
57         delete list;
58 }
59
60 //**********************************************************************************************************************
61
62 int GetRepSeqsCommand::execute(){
63         try {
64                 int count = 1;
65                 string binnames, name, sequence;
66                 
67                 //read fastafile
68                 fasta->readFastaFile(in);
69                 
70                 //set format to list so input can get listvector
71                 globaldata->setFormat("list");
72                 
73                 //if user gave a namesfile then use it
74                 if (namesfile != "") {
75                         readNamesFile();
76                 }
77                 
78                 //read list file
79                 read = new ReadOTUFile(globaldata->getListFile());      
80                 read->read(&*globaldata); 
81                 
82                 input = globaldata->ginput;
83                 list = globaldata->gListVector;
84                                 
85                 while(list != NULL){
86                         
87                         if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){
88                                 
89                                 cout << list->getLabel() << '\t' << count << endl;
90                                 
91                                 //open output list files
92                                 for (int i=0; i<groupMap->getNumGroups(); i++) {//opens an output file for each group
93                                         openOutputFile(fastafile + groupMap->namesOfGroups[i] + list->getLabel() + ".fasta", *(filehandles[groupMap->namesOfGroups[i]]));
94                                         used[groupMap->namesOfGroups[i]] = false;
95                                 }
96                                 string s = "shared";
97                                 openOutputFile(fastafile + s + list->getLabel() + ".fasta", *(filehandles[s]));
98                                 used[s] = false;
99                                 
100                                 
101                                 //for each bin in the list vector
102                                 for (int i = 0; i < list->size(); i++) {
103                                         seq.clear();
104                                         //uses this to determine if the bin is unique to one group or if it is shared
105                                         map<string, string> groups;
106
107                                         //determine if this otu is unique to one group or not
108                                         binnames = list->get(i);
109                                         while (binnames.find_first_of(',') != -1) { 
110                                                 //parse out each name in bin
111                                                 name = binnames.substr(0,binnames.find_first_of(','));
112                                                 binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length());
113                                                 
114                                                 //do work for that name
115                                                 sequence = fasta->getSequence(name);
116                                                 if (sequence != "not found") {
117                                                         string group = groupMap->getGroup(name);
118                                                         if (group != "not found") {  groups[group] = group;     }  //add group to list of groups in this bin
119                                                         else {  
120                                                                 cout << "error sequence " << name << " is not assigned a group in your groupfile. Please correct." << endl;
121                                                                 removeFiles(list->getLabel());
122                                                                 return 0;
123                                                         }
124                                                         name = ">" + name + "|" + toString(i+1);
125                                                         seq[name] = sequence;
126                                                 }else { 
127                                                         cout << name << " is missing from your fasta or name file. Please correct. " << endl; 
128                                                         removeFiles(list->getLabel());
129                                                         return 0;
130                                                 }
131                                                 
132                                         }
133                                         
134                                         //get last name
135                                         sequence = fasta->getSequence(binnames);
136                                         if (sequence != "not found") {
137                                                 string group = groupMap->getGroup(binnames);
138                                                 if (group != "not found") {  groups[group] = group;     }  //add group to list of groups in this bin
139                                                 else {  
140                                                         cout << "error sequence " << binnames << " is not assigned a group in your groupfile. Please correct." << endl;
141                                                         removeFiles(list->getLabel());
142                                                         return 0;
143                                                 }
144                                                 binnames = ">" + binnames + "|" + toString(i+1);  //attach bin number to name
145                                                 seq[binnames] = sequence;
146                                         }else { 
147                                                 cout << binnames << " is missing from your fasta or name file. Please correct. " << endl; 
148                                                 removeFiles(list->getLabel());
149                                                 return 0;
150                                         }
151                                         
152                                         //output each bin to files
153                                         //what file does this bin need to be outputted to 
154                                         if (groups.size() == 1) { //this bin is unique to one group
155                                                 it3 = groups.begin();
156                                                 string uniqueGroup = it3->first;
157                                                 used[uniqueGroup] = true;
158                                                 //print out sequences from that bin to shared file
159                                                 for (it3 = seq.begin(); it3 != seq.end(); it3++){
160                                                         *(filehandles[uniqueGroup]) << it3->first << endl;
161                                                         *(filehandles[uniqueGroup]) << it3->second << endl;
162                                                 }
163                                         }else {//this bin has sequences from multiple groups in it
164                                                 used[s] = true;
165                                                 //print out sequences from that bin to shared file
166                                                 for (it3 = seq.begin(); it3 != seq.end(); it3++){
167                                                         *(filehandles[s]) << it3->first << endl;
168                                                         *(filehandles[s]) << it3->second << endl;
169                                                 }
170                                         }
171                                 }
172                                 
173                                 //close ostreams and remove unused files
174                                 for (it = filehandles.begin(); it != filehandles.end(); it++) {
175                                         it->second->close();
176                                         if (used[it->first] == false) { string filename = fastafile + it->first + list->getLabel() + ".fasta";  remove(filename.c_str());  }
177                                 }
178
179                         }
180                         
181                         delete list;
182                         list = input->getListVector();
183                         count++;
184                 }
185                 
186                 return 0;
187         }
188         catch(exception& e) {
189                 cout << "Standard Error: " << e.what() << " has occurred in the GetRepSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
190                 exit(1);
191         }
192         catch(...) {
193                 cout << "An unknown error has occurred in the GetRepSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
194                 exit(1);
195         }       
196 }
197
198 //**********************************************************************************************************************
199 void GetRepSeqsCommand::readNamesFile() {
200         try {
201                 vector<string> dupNames;
202                 openInputFile(namesfile, inNames);
203                 
204                 string name, names, sequence;
205         
206                 while(inNames){
207                         inNames >> name;                        //read from first column  A
208                         inNames >> names;               //read from second column  A,B,C,D
209                         
210                         dupNames.clear();
211                         
212                         //parse names into vector
213                         splitAtComma(names, dupNames);
214                         
215                         //store names in fasta map
216                         sequence = fasta->getSequence(name);
217                         for (int i = 0; i < dupNames.size(); i++) {
218                                 fasta->push_back(dupNames[i], sequence);
219                         }
220                 
221                         gobble(inNames);
222                 }
223                 inNames.close();
224
225         }
226         catch(exception& e) {
227                 cout << "Standard Error: " << e.what() << " has occurred in the GetRepSeqsCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
228                 exit(1);
229         }
230         catch(...) {
231                 cout << "An unknown error has occurred in the GetRepSeqsCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
232                 exit(1);
233         }       
234 }
235 //**********************************************************************************************************************
236 void GetRepSeqsCommand::removeFiles(string label) {
237         try {
238                         //close ostreams
239                         for (it = filehandles.begin(); it != filehandles.end(); it++) {
240                                 it->second->close();
241                         }
242
243                         //remove output files because there was an error
244                         for (int i=0; i<groupMap->getNumGroups(); i++) {
245                                 string outputFileName = fastafile + groupMap->namesOfGroups[i] + label + ".fasta";
246                                 remove(outputFileName.c_str());
247                         }
248                         string outputFileName = fastafile + "shared"+ label + ".fasta";
249                         remove(outputFileName.c_str());
250
251         }
252         catch(exception& e) {
253                 cout << "Standard Error: " << e.what() << " has occurred in the GetRepSeqsCommand class Function removeFiles. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
254                 exit(1);
255         }
256         catch(...) {
257                 cout << "An unknown error has occurred in the GetRepSeqsCommand class function removeFiles. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
258                 exit(1);
259         }       
260 }
261
262 //**********************************************************************************************************************
263