]> git.donarmstrong.com Git - mothur.git/blob - filterseqscommand.cpp
This contains Pat's bug fixes and updates. It represents mothur v.1.3.0
[mothur.git] / filterseqscommand.cpp
1 /*
2  *  filterseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Thomas Ryabin on 5/4/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "filterseqscommand.h"
11
12 /**************************************************************************************/
13
14 FilterSeqsCommand::FilterSeqsCommand(){
15         globaldata = GlobalData::getInstance();
16         
17         if(globaldata->getFastaFile() != "")            {       readSeqs =  new ReadFasta(globaldata->inputFileName);   }
18         else if(globaldata->getNexusFile() != "")       {       readSeqs = new ReadNexus(globaldata->inputFileName);    }
19         else if(globaldata->getClustalFile() != "") {   readSeqs = new ReadClustal(globaldata->inputFileName);  }
20         else if(globaldata->getPhylipFile() != "")      {       readSeqs = new ReadPhylip(globaldata->inputFileName);   }
21         
22         readSeqs->read();
23         db = readSeqs->getDB();
24         numSeqs = db->size();
25         
26         alignmentLength = db->get(0).getLength();
27
28         filter = string(alignmentLength, '1');
29 }
30
31 /**************************************************************************************/
32
33 void FilterSeqsCommand::doHard() {
34         
35         string hardName = globaldata->getHard();
36         string hardFilter = "";
37                 
38         ifstream fileHandle;
39         openInputFile(hardName, fileHandle);
40         
41         fileHandle >> hardFilter;
42         
43         if(hardFilter.length() != filter.length()){
44                 cout << "The hard filter is not the same length as the alignment: Hard filter will not be applied." << endl;
45         }
46         else{
47                 filter = hardFilter;
48         }
49         
50 }
51
52 /**************************************************************************************/
53
54 void FilterSeqsCommand::doTrump() {
55
56         char trump = globaldata->getTrump()[0];
57         
58         for(int i = 0; i < numSeqs; i++) {
59                 string curAligned = db->get(i).getAligned();;
60
61                 for(int j = 0; j < alignmentLength; j++) {
62                         if(curAligned[j] == trump){
63                                 filter[j] = '0';
64                         }
65                 }
66         }
67
68 }
69
70 /**************************************************************************************/
71
72 void FilterSeqsCommand::doVertical() {
73
74         vector<int> counts(alignmentLength, 0);
75         
76         for(int i = 0; i < numSeqs; i++) {
77                 string curAligned = db->get(i).getAligned();;
78                 
79                 for(int j = 0; j < alignmentLength; j++) {
80                         if(curAligned[j] == '-' || curAligned[j] == '.'){
81                                 counts[j]++;
82                         }
83                 }
84         }
85         for(int i=0;i<alignmentLength;i++){
86                 if(counts[i] == numSeqs)        {       filter[i] = '0';                }
87         }
88 }
89
90 /**************************************************************************************/
91
92 void FilterSeqsCommand::doSoft() {
93
94         int softThreshold = numSeqs * (float)atoi(globaldata->getSoft().c_str()) / 100.0;
95
96         vector<int> a(alignmentLength, 0);
97         vector<int> t(alignmentLength, 0);
98         vector<int> g(alignmentLength, 0);
99         vector<int> c(alignmentLength, 0);
100         vector<int> x(alignmentLength, 0);
101         
102         for(int i=0;i<numSeqs;i++){
103                 string curAligned = db->get(i).getAligned();;
104
105                 for(int j=0;j<alignmentLength;j++){
106                         if(toupper(curAligned[j]) == 'A')                                                                               {       a[j]++; }
107                         else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[i]) == 'U') {       t[j]++; }
108                         else if(toupper(curAligned[j]) == 'G')                                                                  {       g[j]++; }
109                         else if(toupper(curAligned[j]) == 'C')                                                                  {       c[j]++; }
110                 }
111         }
112
113         for(int i=0;i<alignmentLength;i++){
114                 if(a[i] < softThreshold && t[i] < softThreshold && g[i] < softThreshold && c[i] < softThreshold){
115                         filter[i] = '0';                        
116                 }
117         }
118 }
119
120 /**************************************************************************************/
121
122 int FilterSeqsCommand::execute() {      
123         try {
124                                                 
125                 if(globaldata->getHard().compare("") != 0)              {       doHard();               }       //      has to be applied first!
126                 if(globaldata->getTrump().compare("") != 0)             {       doTrump();              }
127                 if(globaldata->getVertical() == "T")                    {       doVertical();   }
128                 if(globaldata->getSoft().compare("") != 0)              {       doSoft();               }
129
130                 ofstream outfile;
131                 string filterFile = getRootName(globaldata->inputFileName) + "filter";
132                 openOutputFile(filterFile, outfile);
133
134                 outfile << filter << endl;
135                 outfile.close();
136                 
137                 string filteredFasta = getRootName(globaldata->inputFileName) + "filter.fasta";
138                 openOutputFile(filteredFasta, outfile);
139
140                 for(int i=0;i<numSeqs;i++){
141                         string curAligned = db->get(i).getAligned();
142                         outfile << '>' << db->get(i).getName() << endl;
143                         for(int j=0;j<alignmentLength;j++){
144                                 if(filter[j] == '1'){
145                                         outfile << curAligned[j];
146                                 }
147                         }
148                         outfile << endl;
149                 }
150                 outfile.close();
151                 
152                 int filteredLength = 0;
153                 for(int i=0;i<alignmentLength;i++){
154                         if(filter[i] == '1'){   filteredLength++;       }
155                 }
156                 
157                 cout << endl;
158                 cout << "Length of filtered alignment: " << filteredLength << endl;
159                 cout << "Number of columns removed: " << alignmentLength-filteredLength << endl;
160                 cout << "Length of the original alignment: " << alignmentLength << endl;
161                 cout << "Number of sequences used to construct filter: " << numSeqs << endl;
162                 
163                 globaldata->clear();
164                 
165                 return 0;
166                 
167         }
168         catch(exception& e) {
169                 cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
170                 exit(1);
171         }
172         catch(...) {
173                 cout << "An unknown error has occurred in the FilterSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
174                 exit(1);
175         }
176 }
177
178 /**************************************************************************************/