]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
added pipeline commands which involved change to command factory and command class...
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> SeqErrorCommand::getValidParameters(){   
14         try {
15                 string Array[] =  {"query", "reference", "name", "threshold", "inputdir", "outputdir"};
16                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
17                 return myArray;
18         }
19         catch(exception& e) {
20                 m->errorOut(e, "SeqErrorCommand", "getValidParameters");
21                 exit(1);
22         }
23 }
24 //**********************************************************************************************************************
25 SeqErrorCommand::SeqErrorCommand(){     
26         try {
27                 //initialize outputTypes
28                 vector<string> tempOutNames;
29                 outputTypes["error"] = tempOutNames;
30                 outputTypes["count"] = tempOutNames;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 vector<string> SeqErrorCommand::getRequiredParameters(){        
39         try {
40                 string Array[] =  {"query","reference"};
41                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
42                 return myArray;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "SeqErrorCommand", "getRequiredParameters");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50 vector<string> SeqErrorCommand::getRequiredFiles(){     
51         try {
52                 vector<string> myArray;
53                 return myArray;
54         }
55         catch(exception& e) {
56                 m->errorOut(e, "SeqErrorCommand", "getRequiredFiles");
57                 exit(1);
58         }
59 }
60 //***************************************************************************************************************
61
62 SeqErrorCommand::SeqErrorCommand(string option)  {
63         try {
64                 
65                 abort = false;
66                 
67                 //allow user to run help
68                 if(option == "help") { help(); abort = true; }
69                 
70                 else {
71                         string temp;
72                         
73                         //valid paramters for this command
74                         string AlignArray[] =  {"query", "reference", "name", "threshold", "inputdir", "outputdir"};
75                         
76 //need to implement name file option
77                         
78                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
79                         
80                         OptionParser parser(option);
81                         map<string,string> parameters = parser.getParameters();
82                         
83                         ValidParameters validParameter;
84                         map<string,string>::iterator it;
85                         
86                         //check to make sure all parameters are valid for command
87                         for (it = parameters.begin(); it != parameters.end(); it++) { 
88                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
89                         }
90                         
91                         //initialize outputTypes
92                         vector<string> tempOutNames;
93                         outputTypes["error"] = tempOutNames;
94                         outputTypes["count"] = tempOutNames;
95                         
96                         //if the user changes the input directory command factory will send this info to us in the output parameter 
97                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
98                         if (inputDir == "not found"){   inputDir = "";          }
99                         else {
100                                 string path;
101                                 it = parameters.find("query");
102                                 //user has given a template file
103                                 if(it != parameters.end()){ 
104                                         path = m->hasPath(it->second);
105                                         //if the user has not given a path then, add inputdir. else leave path alone.
106                                         if (path == "") {       parameters["query"] = inputDir + it->second;            }
107                                 }
108                                 
109                                 it = parameters.find("reference");
110                                 //user has given a template file
111                                 if(it != parameters.end()){ 
112                                         path = m->hasPath(it->second);
113                                         //if the user has not given a path then, add inputdir. else leave path alone.
114                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
115                                 }
116                                 
117                                 it = parameters.find("name");
118                                 //user has given a template file
119                                 if(it != parameters.end()){ 
120                                         path = m->hasPath(it->second);
121                                         //if the user has not given a path then, add inputdir. else leave path alone.
122                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
123                                 }
124                                 
125                         }
126                         //check for required parameters
127                         queryFileName = validParameter.validFile(parameters, "query", true);
128                         if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
129                         else if (queryFileName == "not open") { abort = true; } 
130                         
131                         referenceFileName = validParameter.validFile(parameters, "reference", true);
132                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
133                         else if (referenceFileName == "not open") { abort = true; }     
134                         
135                         //if the user changes the output directory command factory will send this info to us in the output parameter 
136                         namesFileName = validParameter.validFile(parameters, "name", true);
137                         if(namesFileName == "not found"){       namesFileName = "";     }
138                         cout << namesFileName << endl;
139                         
140                         outputDir = validParameter.validFile(parameters, "outputdir", false);
141                         if (outputDir == "not found"){  
142                                 outputDir = ""; 
143                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
144                         }
145                         
146                         //check for optional parameter and set defaults
147                         // ...at some point should added some additional type checking...
148                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
149                         convert(temp, threshold);  
150                                                 
151                         errorFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".errors";
152                         m->openOutputFile(errorFileName, errorFile);
153                         outputNames.push_back(errorFileName); outputTypes["error"].push_back(errorFileName);
154                         printErrorHeader();
155                 }
156         }
157         catch(exception& e) {
158                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
159                 exit(1);
160         }
161 }
162
163 //**********************************************************************************************************************
164
165 void SeqErrorCommand::help(){
166         try {
167                 m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n");
168                 
169                 
170                 
171                 m->mothurOut("Example seq.error(...).\n");
172                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
173                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n");
174         }
175         catch(exception& e) {
176                 m->errorOut(e, "SeqErrorCommand", "help");
177                 exit(1);
178         }
179 }
180
181 //***************************************************************************************************************
182
183 SeqErrorCommand::~SeqErrorCommand(){    errorFile.close();      }
184
185 //***************************************************************************************************************
186
187 int SeqErrorCommand::execute(){
188         try{
189                 if (abort == true) { return 0; }
190
191                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
192
193                 map<string, int> weights;
194                 if(namesFileName != ""){        weights = getWeights(); }
195                 
196                 ifstream queryFile;
197                 m->openInputFile(queryFileName, queryFile);
198                                 
199                 int totalBases = 0;
200                 int totalMatches = 0;
201                 
202                 vector<int> misMatchCounts(11, 0);
203                 int maxMismatch = 0;
204                 int numSeqs = 0;
205                 
206                 map<string, int>::iterator it;
207                 
208                 while(queryFile){
209                         Compare minCompare;
210                         
211                         Sequence query(queryFile);
212                         
213                         for(int i=0;i<numRefs;i++){
214                                 Compare currCompare = getErrors(query, referenceSeqs[i]);
215                                 
216                                 if(currCompare.errorRate < minCompare.errorRate){
217                                         minCompare = currCompare;
218                                 }
219                                 
220                         }
221
222                         if(namesFileName != ""){
223                                 it = weights.find(query.getName());
224                                 minCompare.weight = it->second;
225                         }
226                         else {
227                                 minCompare.weight = 1;
228                         }
229
230                         printErrorData(minCompare);
231
232                         if(minCompare.errorRate < threshold){
233                                 totalBases += (minCompare.total * minCompare.weight);
234                                 totalMatches += minCompare.matches * minCompare.weight;
235                                 if(minCompare.mismatches > maxMismatch){
236                                         maxMismatch = minCompare.mismatches;
237                                         misMatchCounts.resize(maxMismatch + 1, 0);
238                                 }                               
239                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
240                                 numSeqs++;
241                         }
242                         
243                 }
244                 queryFile.close();
245                 
246                 int total = 0;
247                 
248                 
249                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".count";
250                 ofstream errorCountFile;
251                 m->openOutputFile(errorCountFileName, errorCountFile);
252                 outputNames.push_back(errorCountFileName);  outputTypes["count"].push_back(errorCountFileName);
253                 
254                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n");
255                 m->mothurOut("Errors\tSequences\n");
256                 
257                 errorCountFile << "Errors\tSequences\n";
258                 
259                 for(int i=0;i<misMatchCounts.size();i++){
260                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
261                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
262                 }
263                 
264                 return 0;       
265         }
266         catch(exception& e) {
267                 m->errorOut(e, "SeqErrorCommand", "execute");
268                 exit(1);
269         }
270 }
271
272 //***************************************************************************************************************
273
274 void SeqErrorCommand::getReferences(){
275         try {
276                 
277                 ifstream referenceFile;
278                 m->openInputFile(referenceFileName, referenceFile);
279                 
280                 while(referenceFile){
281                         Sequence currentSeq(referenceFile);
282                         int numAmbigs = currentSeq.getAmbigBases();
283                         
284                         if(numAmbigs != 0){
285                                 m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
286                                 currentSeq.removeAmbigBases();
287                         }
288                         referenceSeqs.push_back(currentSeq);
289                         m->gobble(referenceFile);
290                 }
291                 numRefs = referenceSeqs.size();
292                 
293                 referenceFile.close();
294         }
295         catch(exception& e) {
296                 m->errorOut(e, "SeqErrorCommand", "getReferences");
297                 exit(1);
298         }
299 }
300
301 //***************************************************************************************************************
302
303 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
304         try {
305                 if(query.getAlignLength() != reference.getAlignLength()){
306                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
307                 }
308                 int alignLength = query.getAlignLength();
309         
310                 string q = query.getAligned();
311                 string r = reference.getAligned();
312
313                 int started = 0;
314                 Compare errors;
315
316                 for(int i=0;i<alignLength;i++){
317                         if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                 //      no missing data and no double gaps
318                                 started = 1;
319                                 
320                                 if(q[i] == 'A'){
321                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       }
322                                         if(r[i] == 'T'){        errors.AT++;    }
323                                         if(r[i] == 'G'){        errors.AG++;    }
324                                         if(r[i] == 'C'){        errors.AC++;    }
325                                         if(r[i] == '-'){        errors.Ai++;    }
326                                 }
327                                 else if(q[i] == 'T'){
328                                         if(r[i] == 'A'){        errors.TA++;    }
329                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       }
330                                         if(r[i] == 'G'){        errors.TG++;    }
331                                         if(r[i] == 'C'){        errors.TC++;    }
332                                         if(r[i] == '-'){        errors.Ti++;    }
333                                 }
334                                 else if(q[i] == 'G'){
335                                         if(r[i] == 'A'){        errors.GA++;    }
336                                         if(r[i] == 'T'){        errors.GT++;    }
337                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       }
338                                         if(r[i] == 'C'){        errors.GC++;    }
339                                         if(r[i] == '-'){        errors.Gi++;    }
340                                 }
341                                 else if(q[i] == 'C'){
342                                         if(r[i] == 'A'){        errors.CA++;    }
343                                         if(r[i] == 'T'){        errors.CT++;    }
344                                         if(r[i] == 'G'){        errors.CG++;    }
345                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       }
346                                         if(r[i] == '-'){        errors.Ci++;    }
347                                 }
348                                 else if(q[i] == 'N'){
349                                         if(r[i] == 'A'){        errors.NA++;    }
350                                         if(r[i] == 'T'){        errors.NT++;    }
351                                         if(r[i] == 'G'){        errors.NG++;    }
352                                         if(r[i] == 'C'){        errors.NC++;    }
353                                         if(r[i] == '-'){        errors.Ni++;    }
354                                 }
355                                 else if(q[i] == '-' && r[i] != '-'){
356                                         if(r[i] == 'A'){        errors.dA++;    }
357                                         if(r[i] == 'T'){        errors.dT++;    }
358                                         if(r[i] == 'G'){        errors.dG++;    }
359                                         if(r[i] == 'C'){        errors.dC++;    }
360                                 }
361                                 errors.total++; 
362                                 
363                         }
364                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
365                                 if(started == 1){       break;  }
366                         }
367                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
368                                 m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ".  Ignoring the extra bases in the query\n");
369                                 if(started == 1){       break;  }
370                         }
371                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
372                                 if(started == 1){       break;  }                       
373                         }
374                         
375                 }
376                 errors.mismatches = errors.total-errors.matches;
377                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
378                 errors.queryName = query.getName();
379                 errors.refName = reference.getName();
380                 
381                 return errors;
382         }
383         catch(exception& e) {
384                 m->errorOut(e, "SeqErrorCommand", "getErrors");
385                 exit(1);
386         }
387 }
388
389 //***************************************************************************************************************
390
391 map<string, int> SeqErrorCommand::getWeights(){
392         ifstream nameFile;
393         m->openInputFile(namesFileName, nameFile);
394         
395         string seqName;
396         string redundantSeqs;
397         map<string, int> nameCountMap;
398         
399         while(nameFile){
400                 nameFile >> seqName >> redundantSeqs;
401                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
402                 m->gobble(nameFile);
403         }
404         return nameCountMap;
405 }
406
407
408 //***************************************************************************************************************
409
410 void SeqErrorCommand::printErrorHeader(){
411         try {
412                 errorFile << "query\treference\tweight\t";
413                 errorFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
414                 errorFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n";
415                 
416                 errorFile << setprecision(6);
417                 errorFile.setf(ios::fixed);
418         }
419         catch(exception& e) {
420                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
421                 exit(1);
422         }
423 }
424
425 //***************************************************************************************************************
426
427 void SeqErrorCommand::printErrorData(Compare error){
428         try {
429                 errorFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
430                 errorFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
431                 errorFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
432                 errorFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
433                 errorFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
434                 errorFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
435                 errorFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
436                 errorFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
437                 
438                 errorFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                 //insertions
439                 errorFile << error.dA + error.dT + error.dG + error.dC << '\t';                 //deletions
440                 errorFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';     //substitutions
441                 errorFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';      //ambiguities
442                 errorFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl;
443                 
444         }
445         catch(exception& e) {
446                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
447                 exit(1);
448         }
449 }
450
451 //***************************************************************************************************************
452
453
454
455
456
457
458