]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.cpp
merged pat's trim seqs edits with sarah's major overhaul of global data; also added...
[mothur.git] / screenseqscommand.cpp
1 /*
2  *  screenseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/3/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
12
13 //***************************************************************************************************************
14
15 ScreenSeqsCommand::ScreenSeqsCommand(string option){
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string AlignArray[] =  {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"};
25                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
26                         
27                         OptionParser parser(option);
28                         map<string,string> parameters = parser.getParameters();
29                         
30                         ValidParameters validParameter;
31                         
32                         //check to make sure all parameters are valid for command
33                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
34                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
35                         }
36                         
37                         //check for required parameters
38                         fastafile = validParameter.validFile(parameters, "fasta", true);
39                         if (fastafile == "not found") { cout << "fasta is a required parameter for the screen.seqs command." << endl; abort = true; }
40                         else if (fastafile == "not open") { abort = true; }     
41                 
42                         groupfile = validParameter.validFile(parameters, "group", true);
43                         if (groupfile == "not open") { abort = true; }  
44                         else if (groupfile == "not found") { groupfile = ""; }
45                         
46                         namefile = validParameter.validFile(parameters, "name", true);
47                         if (namefile == "not open") { abort = true; }
48                         else if (namefile == "not found") { namefile = ""; }    
49
50                 
51                         //check for optional parameter and set defaults
52                         // ...at some point should added some additional type checking...
53                         string temp;
54                         temp = validParameter.validFile(parameters, "start", false);                    if (temp == "not found") { temp = "-1"; }
55                         convert(temp, startPos); 
56                 
57                         temp = validParameter.validFile(parameters, "end", false);                              if (temp == "not found") { temp = "-1"; }
58                         convert(temp, endPos);  
59
60                         temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
61                         convert(temp, maxAmbig);  
62
63                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
64                         convert(temp, maxHomoP);  
65
66                         temp = validParameter.validFile(parameters, "minlength", false);                if (temp == "not found") { temp = "-1"; }
67                         convert(temp, minLength); 
68                         
69                         temp = validParameter.validFile(parameters, "maxlength", false);                if (temp == "not found") { temp = "-1"; }
70                         convert(temp, maxLength); 
71                 }
72
73         }
74         catch(exception& e) {
75                 cout << "Standard Error: " << e.what() << " has occurred in the ScreenSeqsCommand class Function ScreenSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
76                 exit(1);
77         }
78         catch(...) {
79                 cout << "An unknown error has occurred in the ScreenSeqsCommand class function ScreenSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
80                 exit(1);
81         }       
82 }
83 //**********************************************************************************************************************
84
85 void ScreenSeqsCommand::help(){
86         try {
87                 cout << "The screen.seqs command reads a fastafile and creates ....." << "\n";
88                 cout << "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, and group." << "\n";
89                 cout << "The fasta parameter is required." << "\n";
90                 cout << "The start parameter .... The default is -1." << "\n";
91                 cout << "The end parameter .... The default is -1." << "\n";
92                 cout << "The maxambig parameter .... The default is -1." << "\n";
93                 cout << "The maxhomop parameter .... The default is -1." << "\n";
94                 cout << "The minlength parameter .... The default is -1." << "\n";
95                 cout << "The maxlength parameter .... The default is -1." << "\n";
96                 cout << "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile." << "\n";
97                 cout << "The screen.seqs command should be in the following format: " << "\n";
98                 cout << "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  " << "\n";
99                 cout << "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  " << "\n";    
100                 cout << "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...)." << "\n";
101                 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta)." << "\n" << "\n";
102
103         }
104         catch(exception& e) {
105                 cout << "Standard Error: " << e.what() << " has occurred in the ScreenSeqsCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
106                 exit(1);
107         }
108         catch(...) {
109                 cout << "An unknown error has occurred in the ScreenSeqsCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
110                 exit(1);
111         }       
112 }
113
114 //***************************************************************************************************************
115
116 ScreenSeqsCommand::~ScreenSeqsCommand(){        /*      do nothing      */      }
117
118 //***************************************************************************************************************
119
120 int ScreenSeqsCommand::execute(){
121         try{
122                 
123                 if (abort == true) { return 0; }
124                                 
125                 ifstream inFASTA;
126                 openInputFile(fastafile, inFASTA);
127                 
128                 set<string> badSeqNames;
129                 
130                 string goodSeqFile = getRootName(fastafile) + "good" + getExtension(fastafile);
131                 string badSeqFile = getRootName(fastafile) + "bad" + getExtension(fastafile);
132                 
133                 ofstream goodSeqOut;    openOutputFile(goodSeqFile, goodSeqOut);
134                 ofstream badSeqOut;             openOutputFile(badSeqFile, badSeqOut);          
135                 
136                 while(!inFASTA.eof()){
137                         Sequence currSeq(inFASTA);
138                         bool goodSeq = 1;               //      innocent until proven guilty
139                         if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
140                         if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
141                         if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
142                         if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
143                         if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
144                         if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
145                         
146                         if(goodSeq == 1){
147                                 currSeq.printSequence(goodSeqOut);      
148                         }
149                         else{
150                                 currSeq.printSequence(badSeqOut);       
151                                 badSeqNames.insert(currSeq.getName());
152                         }
153                         gobble(inFASTA);
154                 }       
155                 if(namefile != ""){
156                         screenNameGroupFile(badSeqNames);
157                 }
158                 else if(groupfile != ""){
159                         screenGroupFile(badSeqNames);
160                 }
161                 
162                 return 0;
163         }
164         catch(exception& e) {
165                 cout << "Standard Error: " << e.what() << " has occurred in the ScreenSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
166                 exit(1);
167         }
168         catch(...) {
169                 cout << "An unknown error has occurred in the ScreenSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
170                 exit(1);
171         }
172         
173 }
174
175 //***************************************************************************************************************
176
177 void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
178
179         ifstream inputNames;
180         openInputFile(namefile, inputNames);
181         set<string> badSeqGroups;
182         string seqName, seqList, group;
183         set<string>::iterator it;
184
185         string goodNameFile = getRootName(namefile) + "good" + getExtension(namefile);
186         string badNameFile = getRootName(namefile) + "bad" + getExtension(namefile);
187         
188         ofstream goodNameOut;   openOutputFile(goodNameFile, goodNameOut);
189         ofstream badNameOut;    openOutputFile(badNameFile, badNameOut);                
190         
191         while(!inputNames.eof()){
192                 inputNames >> seqName >> seqList;
193                 it = badSeqNames.find(seqName);
194                 
195                 if(it != badSeqNames.end()){
196                         badSeqNames.erase(it);
197                         badNameOut << seqName << '\t' << seqList << endl;
198                         if(namefile != ""){
199                                 int start = 0;
200                                 for(int i=0;i<seqList.length();i++){
201                                         if(seqList[i] == ','){
202                                                 badSeqGroups.insert(seqList.substr(start,i-start));
203                                                 start = i+1;
204                                         }                                       
205                                 }
206                                 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
207                         }
208                 }
209                 else{
210                         goodNameOut << seqName << '\t' << seqList << endl;
211                 }
212                 gobble(inputNames);
213         }
214         inputNames.close();
215         goodNameOut.close();
216         badNameOut.close();
217         
218         if(groupfile != ""){
219                 
220                 ifstream inputGroups;
221                 openInputFile(groupfile, inputGroups);
222
223                 string goodGroupFile = getRootName(groupfile) + "good" + getExtension(groupfile);
224                 string badGroupFile = getRootName(groupfile) + "bad" + getExtension(groupfile);
225                 
226                 ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
227                 ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
228                 
229                 while(!inputGroups.eof()){
230                         inputGroups >> seqName >> group;
231
232                         it = badSeqGroups.find(seqName);
233                         
234                         if(it != badSeqGroups.end()){
235                                 badSeqGroups.erase(it);
236                                 badGroupOut << seqName << '\t' << group << endl;
237                         }
238                         else{
239                                 goodGroupOut << seqName << '\t' << group << endl;
240                         }
241                         gobble(inputGroups);
242                 }
243                 inputGroups.close();
244                 goodGroupOut.close();
245                 badGroupOut.close();
246         }
247 }
248
249 //***************************************************************************************************************
250
251 void ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
252
253         ifstream inputGroups;
254         openInputFile(groupfile, inputGroups);
255         string seqName, group;
256         set<string>::iterator it;
257         
258         string goodGroupFile = getRootName(groupfile) + "good" + getExtension(groupfile);
259         string badGroupFile = getRootName(groupfile) + "bad" + getExtension(groupfile);
260         
261         ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
262         ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
263         
264         while(!inputGroups.eof()){
265                 inputGroups >> seqName >> group;
266                 it = badSeqNames.find(seqName);
267                 
268                 if(it != badSeqNames.end()){
269                         badSeqNames.erase(it);
270                         badGroupOut << seqName << '\t' << group << endl;
271                 }
272                 else{
273                         goodGroupOut << seqName << '\t' << group << endl;
274                 }
275                 gobble(inputGroups);
276         }
277         inputGroups.close();
278         goodGroupOut.close();
279         badGroupOut.close();
280         
281 }
282
283 //***************************************************************************************************************
284
285