]> git.donarmstrong.com Git - mothur.git/blob - filterseqscommand.cpp
e024a242ed8b35771580b00cffd66bc7dd78d66d
[mothur.git] / filterseqscommand.cpp
1 /*
2  *  filterseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Thomas Ryabin on 5/4/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "filterseqscommand.h"
11
12 /**************************************************************************************/
13
14 FilterSeqsCommand::FilterSeqsCommand(){
15
16         globaldata = GlobalData::getInstance();
17         
18         if(globaldata->getFastaFile() == "")            {       cout << "You must enter a fasta formatted file" << endl;        }
19         trump = globaldata->getTrump()[0];
20         numSeqs = 0;
21
22 }
23
24 /**************************************************************************************/
25
26 void FilterSeqsCommand::doHard() {
27         
28         string hardName = globaldata->getHard();
29         string hardFilter = "";
30                 
31         ifstream fileHandle;
32         openInputFile(hardName, fileHandle);
33         
34         fileHandle >> filter;
35
36 }
37
38 /**************************************************************************************/
39
40 void FilterSeqsCommand::doTrump(Sequence seq) {
41         
42         string curAligned = seq.getAligned();
43         
44         for(int j = 0; j < alignmentLength; j++) {
45                 if(curAligned[j] == trump){
46                         filter[j] = '0';
47                 }
48         }
49
50 }
51
52 /**************************************************************************************/
53
54 void FilterSeqsCommand::doVertical() {
55
56         for(int i=0;i<alignmentLength;i++){
57                 if(gap[i] == numSeqs)   {       filter[i] = '0';        }
58         }
59         
60 }
61
62 /**************************************************************************************/
63
64 void FilterSeqsCommand::doSoft() {
65         
66         int threshold = int (soft * numSeqs);
67         bool keep = 0;
68         
69         for(int i=0;i<alignmentLength;i++){
70                 if(a[i] >= threshold)           {       keep = 1;       }
71                 else if(t[i] >= threshold)      {       keep = 1;       }
72                 else if(g[i] >= threshold)      {       keep = 1;       }
73                 else if(c[i] >= threshold)      {       keep = 1;       }
74                 
75                 if(keep == 0)   {       filter[i] = 0;          }
76         }
77 }
78
79 /**************************************************************************************/
80
81 void FilterSeqsCommand::getFreqs(Sequence seq) {
82
83         string curAligned = seq.getAligned();;
84         
85         for(int j=0;j<alignmentLength;j++){
86                 if(toupper(curAligned[j]) == 'A')                                                                               {       a[j]++;         }
87                 else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[j]) == 'U') {       t[j]++;         }
88                 else if(toupper(curAligned[j]) == 'G')                                                                  {       g[j]++;         }
89                 else if(toupper(curAligned[j]) == 'C')                                                                  {       c[j]++;         }
90                 else if(curAligned[j] == '-' || curAligned[j] == '.')                                   {       gap[j]++;       }
91         }
92         
93 }
94
95 /**************************************************************************************/
96
97 int FilterSeqsCommand::execute() {      
98         try {
99                 ifstream inFASTA;
100                 openInputFile(globaldata->getFastaFile(), inFASTA);
101                 
102                 Sequence testSeq(inFASTA);
103                 alignmentLength = testSeq.getAlignLength();
104                 inFASTA.seekg(0);
105                 
106                 if(globaldata->getSoft() != "" || isTrue(globaldata->getVertical())){
107                         a.assign(alignmentLength, 0);
108                         t.assign(alignmentLength, 0);
109                         g.assign(alignmentLength, 0);
110                         c.assign(alignmentLength, 0);
111                         gap.assign(alignmentLength, 0);
112                 }
113                 if(globaldata->getSoft() != ""){
114                         soft = (float)atoi(globaldata->getSoft().c_str()) / 100.0;
115                 }
116                 
117                 if(globaldata->getHard().compare("") != 0)      {       doHard();                                                               }
118                 else                                                                            {       filter = string(alignmentLength, '1');  }
119
120                 if(globaldata->getTrump().compare("") != 0 || isTrue(globaldata->getVertical()) || globaldata->getSoft().compare("") != 0){
121                 
122                         while(!inFASTA.eof()){
123                                 Sequence seq(inFASTA);
124                                 if(globaldata->getTrump().compare("") != 0)     {       doTrump(seq);           }
125                                 if(isTrue(globaldata->getVertical()) || globaldata->getSoft().compare("") != 0){        getFreqs(seq);  }
126                                 numSeqs++;
127                                 cout.flush();
128                         }
129                 
130                 }
131                 inFASTA.close();
132                 
133                 if(isTrue(globaldata->getVertical()) == 1)      {       doVertical();   }
134                 if(globaldata->getSoft().compare("") != 0)      {       doSoft();               }                       
135
136                 ofstream outFilter;
137                 string filterFile = getRootName(globaldata->inputFileName) + "filter";
138                 openOutputFile(filterFile, outFilter);
139                 outFilter << filter << endl;
140                 outFilter.close();
141                 
142
143                 openInputFile(globaldata->getFastaFile(), inFASTA);
144                 string filteredFasta = getRootName(globaldata->inputFileName) + "filter.fasta";
145                 ofstream outFASTA;
146                 openOutputFile(filteredFasta, outFASTA);
147
148                 numSeqs = 0;
149                 while(!inFASTA.eof()){
150                         Sequence seq(inFASTA);
151                         string align = seq.getAligned();
152                         string filterSeq = "";
153         
154                         for(int j=0;j<alignmentLength;j++){
155                                 if(filter[j] == '1'){
156                                         filterSeq += align[j];
157                                 }
158                         }
159
160                         outFASTA << '>' << seq.getName() << endl << filterSeq << endl;
161                         numSeqs++;
162                         gobble(inFASTA);
163                 }
164                 outFASTA.close();
165                 inFASTA.close();
166                 
167                 
168                 int filteredLength = 0;
169                 for(int i=0;i<alignmentLength;i++){
170                         if(filter[i] == '1'){   filteredLength++;       }
171                 }
172                 
173                 cout << endl;
174                 cout << "Length of filtered alignment: " << filteredLength << endl;
175                 cout << "Number of columns removed: " << alignmentLength-filteredLength << endl;
176                 cout << "Length of the original alignment: " << alignmentLength << endl;
177                 cout << "Number of sequences used to construct filter: " << numSeqs << endl;
178                 
179                 globaldata->clear();
180                 
181                 return 0;
182                 
183         }
184         catch(exception& e) {
185                 cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
186                 exit(1);
187         }
188         catch(...) {
189                 cout << "An unknown error has occurred in the FilterSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
190                 exit(1);
191         }
192 }
193
194 /**************************************************************************************/