]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
fixes while testing
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> SeqErrorCommand::getValidParameters(){   
14         try {
15                 string Array[] =  {"query", "reference", "name", "threshold", "inputdir", "outputdir"};
16                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
17                 return myArray;
18         }
19         catch(exception& e) {
20                 m->errorOut(e, "SeqErrorCommand", "getValidParameters");
21                 exit(1);
22         }
23 }
24 //**********************************************************************************************************************
25 SeqErrorCommand::SeqErrorCommand(){     
26         try {
27                 abort = true;
28                 //initialize outputTypes
29                 vector<string> tempOutNames;
30                 outputTypes["error"] = tempOutNames;
31                 outputTypes["count"] = tempOutNames;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 vector<string> SeqErrorCommand::getRequiredParameters(){        
40         try {
41                 string Array[] =  {"query","reference"};
42                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "SeqErrorCommand", "getRequiredParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 vector<string> SeqErrorCommand::getRequiredFiles(){     
52         try {
53                 vector<string> myArray;
54                 return myArray;
55         }
56         catch(exception& e) {
57                 m->errorOut(e, "SeqErrorCommand", "getRequiredFiles");
58                 exit(1);
59         }
60 }
61 //***************************************************************************************************************
62
63 SeqErrorCommand::SeqErrorCommand(string option)  {
64         try {
65                 
66                 abort = false;
67                 
68                 //allow user to run help
69                 if(option == "help") { help(); abort = true; }
70                 
71                 else {
72                         string temp;
73                         
74                         //valid paramters for this command
75                         string AlignArray[] =  {"query", "reference", "name", "threshold", "inputdir", "outputdir"};
76                         
77 //need to implement name file option
78                         
79                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
80                         
81                         OptionParser parser(option);
82                         map<string,string> parameters = parser.getParameters();
83                         
84                         ValidParameters validParameter;
85                         map<string,string>::iterator it;
86                         
87                         //check to make sure all parameters are valid for command
88                         for (it = parameters.begin(); it != parameters.end(); it++) { 
89                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
90                         }
91                         
92                         //initialize outputTypes
93                         vector<string> tempOutNames;
94                         outputTypes["error"] = tempOutNames;
95                         outputTypes["count"] = tempOutNames;
96                         
97                         //if the user changes the input directory command factory will send this info to us in the output parameter 
98                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
99                         if (inputDir == "not found"){   inputDir = "";          }
100                         else {
101                                 string path;
102                                 it = parameters.find("query");
103                                 //user has given a template file
104                                 if(it != parameters.end()){ 
105                                         path = m->hasPath(it->second);
106                                         //if the user has not given a path then, add inputdir. else leave path alone.
107                                         if (path == "") {       parameters["query"] = inputDir + it->second;            }
108                                 }
109                                 
110                                 it = parameters.find("reference");
111                                 //user has given a template file
112                                 if(it != parameters.end()){ 
113                                         path = m->hasPath(it->second);
114                                         //if the user has not given a path then, add inputdir. else leave path alone.
115                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
116                                 }
117                                 
118                                 it = parameters.find("name");
119                                 //user has given a template file
120                                 if(it != parameters.end()){ 
121                                         path = m->hasPath(it->second);
122                                         //if the user has not given a path then, add inputdir. else leave path alone.
123                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
124                                 }
125                                 
126                         }
127                         //check for required parameters
128                         queryFileName = validParameter.validFile(parameters, "query", true);
129                         if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
130                         else if (queryFileName == "not open") { abort = true; } 
131                         
132                         referenceFileName = validParameter.validFile(parameters, "reference", true);
133                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
134                         else if (referenceFileName == "not open") { abort = true; }     
135                         
136                         //if the user changes the output directory command factory will send this info to us in the output parameter 
137                         namesFileName = validParameter.validFile(parameters, "name", true);
138                         if(namesFileName == "not found"){       namesFileName = "";     }
139                         cout << namesFileName << endl;
140                         
141                         outputDir = validParameter.validFile(parameters, "outputdir", false);
142                         if (outputDir == "not found"){  
143                                 outputDir = ""; 
144                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
145                         }
146                         
147                         //check for optional parameter and set defaults
148                         // ...at some point should added some additional type checking...
149                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
150                         convert(temp, threshold);  
151                                                 
152                         errorFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".errors";
153                         m->openOutputFile(errorFileName, errorFile);
154                         outputNames.push_back(errorFileName); outputTypes["error"].push_back(errorFileName);
155                         printErrorHeader();
156                 }
157         }
158         catch(exception& e) {
159                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
160                 exit(1);
161         }
162 }
163
164 //**********************************************************************************************************************
165
166 void SeqErrorCommand::help(){
167         try {
168                 m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n");
169                 
170                 
171                 
172                 m->mothurOut("Example seq.error(...).\n");
173                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
174                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n");
175         }
176         catch(exception& e) {
177                 m->errorOut(e, "SeqErrorCommand", "help");
178                 exit(1);
179         }
180 }
181
182 //***************************************************************************************************************
183
184 SeqErrorCommand::~SeqErrorCommand(){    errorFile.close();      }
185
186 //***************************************************************************************************************
187
188 int SeqErrorCommand::execute(){
189         try{
190                 if (abort == true) { return 0; }
191
192                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
193
194                 map<string, int> weights;
195                 if(namesFileName != ""){        weights = getWeights(); }
196                 
197                 ifstream queryFile;
198                 m->openInputFile(queryFileName, queryFile);
199                                 
200                 int totalBases = 0;
201                 int totalMatches = 0;
202                 
203                 vector<int> misMatchCounts(11, 0);
204                 int maxMismatch = 0;
205                 int numSeqs = 0;
206                 
207                 map<string, int>::iterator it;
208                 
209                 while(queryFile){
210                         Compare minCompare;
211                         
212                         Sequence query(queryFile);
213                         
214                         for(int i=0;i<numRefs;i++){
215                                 Compare currCompare = getErrors(query, referenceSeqs[i]);
216                                 
217                                 if(currCompare.errorRate < minCompare.errorRate){
218                                         minCompare = currCompare;
219                                 }
220                                 
221                         }
222
223                         if(namesFileName != ""){
224                                 it = weights.find(query.getName());
225                                 minCompare.weight = it->second;
226                         }
227                         else {
228                                 minCompare.weight = 1;
229                         }
230
231                         printErrorData(minCompare);
232
233                         if(minCompare.errorRate < threshold){
234                                 totalBases += (minCompare.total * minCompare.weight);
235                                 totalMatches += minCompare.matches * minCompare.weight;
236                                 if(minCompare.mismatches > maxMismatch){
237                                         maxMismatch = minCompare.mismatches;
238                                         misMatchCounts.resize(maxMismatch + 1, 0);
239                                 }                               
240                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
241                                 numSeqs++;
242                         }
243                         
244                 }
245                 queryFile.close();
246                 
247                 int total = 0;
248                 
249                 
250                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".count";
251                 ofstream errorCountFile;
252                 m->openOutputFile(errorCountFileName, errorCountFile);
253                 outputNames.push_back(errorCountFileName);  outputTypes["count"].push_back(errorCountFileName);
254                 
255                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n");
256                 m->mothurOut("Errors\tSequences\n");
257                 
258                 errorCountFile << "Errors\tSequences\n";
259                 
260                 for(int i=0;i<misMatchCounts.size();i++){
261                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
262                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
263                 }
264                 
265                 return 0;       
266         }
267         catch(exception& e) {
268                 m->errorOut(e, "SeqErrorCommand", "execute");
269                 exit(1);
270         }
271 }
272
273 //***************************************************************************************************************
274
275 void SeqErrorCommand::getReferences(){
276         try {
277                 
278                 ifstream referenceFile;
279                 m->openInputFile(referenceFileName, referenceFile);
280                 
281                 while(referenceFile){
282                         Sequence currentSeq(referenceFile);
283                         int numAmbigs = currentSeq.getAmbigBases();
284                         
285                         if(numAmbigs != 0){
286                                 m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
287                                 currentSeq.removeAmbigBases();
288                         }
289                         referenceSeqs.push_back(currentSeq);
290                         m->gobble(referenceFile);
291                 }
292                 numRefs = referenceSeqs.size();
293                 
294                 referenceFile.close();
295         }
296         catch(exception& e) {
297                 m->errorOut(e, "SeqErrorCommand", "getReferences");
298                 exit(1);
299         }
300 }
301
302 //***************************************************************************************************************
303
304 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
305         try {
306                 if(query.getAlignLength() != reference.getAlignLength()){
307                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
308                 }
309                 int alignLength = query.getAlignLength();
310         
311                 string q = query.getAligned();
312                 string r = reference.getAligned();
313
314                 int started = 0;
315                 Compare errors;
316
317                 for(int i=0;i<alignLength;i++){
318                         if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                 //      no missing data and no double gaps
319                                 started = 1;
320                                 
321                                 if(q[i] == 'A'){
322                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       }
323                                         if(r[i] == 'T'){        errors.AT++;    }
324                                         if(r[i] == 'G'){        errors.AG++;    }
325                                         if(r[i] == 'C'){        errors.AC++;    }
326                                         if(r[i] == '-'){        errors.Ai++;    }
327                                 }
328                                 else if(q[i] == 'T'){
329                                         if(r[i] == 'A'){        errors.TA++;    }
330                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       }
331                                         if(r[i] == 'G'){        errors.TG++;    }
332                                         if(r[i] == 'C'){        errors.TC++;    }
333                                         if(r[i] == '-'){        errors.Ti++;    }
334                                 }
335                                 else if(q[i] == 'G'){
336                                         if(r[i] == 'A'){        errors.GA++;    }
337                                         if(r[i] == 'T'){        errors.GT++;    }
338                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       }
339                                         if(r[i] == 'C'){        errors.GC++;    }
340                                         if(r[i] == '-'){        errors.Gi++;    }
341                                 }
342                                 else if(q[i] == 'C'){
343                                         if(r[i] == 'A'){        errors.CA++;    }
344                                         if(r[i] == 'T'){        errors.CT++;    }
345                                         if(r[i] == 'G'){        errors.CG++;    }
346                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       }
347                                         if(r[i] == '-'){        errors.Ci++;    }
348                                 }
349                                 else if(q[i] == 'N'){
350                                         if(r[i] == 'A'){        errors.NA++;    }
351                                         if(r[i] == 'T'){        errors.NT++;    }
352                                         if(r[i] == 'G'){        errors.NG++;    }
353                                         if(r[i] == 'C'){        errors.NC++;    }
354                                         if(r[i] == '-'){        errors.Ni++;    }
355                                 }
356                                 else if(q[i] == '-' && r[i] != '-'){
357                                         if(r[i] == 'A'){        errors.dA++;    }
358                                         if(r[i] == 'T'){        errors.dT++;    }
359                                         if(r[i] == 'G'){        errors.dG++;    }
360                                         if(r[i] == 'C'){        errors.dC++;    }
361                                 }
362                                 errors.total++; 
363                                 
364                         }
365                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
366                                 if(started == 1){       break;  }
367                         }
368                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
369                                 m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ".  Ignoring the extra bases in the query\n");
370                                 if(started == 1){       break;  }
371                         }
372                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
373                                 if(started == 1){       break;  }                       
374                         }
375                         
376                 }
377                 errors.mismatches = errors.total-errors.matches;
378                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
379                 errors.queryName = query.getName();
380                 errors.refName = reference.getName();
381                 
382                 return errors;
383         }
384         catch(exception& e) {
385                 m->errorOut(e, "SeqErrorCommand", "getErrors");
386                 exit(1);
387         }
388 }
389
390 //***************************************************************************************************************
391
392 map<string, int> SeqErrorCommand::getWeights(){
393         ifstream nameFile;
394         m->openInputFile(namesFileName, nameFile);
395         
396         string seqName;
397         string redundantSeqs;
398         map<string, int> nameCountMap;
399         
400         while(nameFile){
401                 nameFile >> seqName >> redundantSeqs;
402                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
403                 m->gobble(nameFile);
404         }
405         return nameCountMap;
406 }
407
408
409 //***************************************************************************************************************
410
411 void SeqErrorCommand::printErrorHeader(){
412         try {
413                 errorFile << "query\treference\tweight\t";
414                 errorFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
415                 errorFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n";
416                 
417                 errorFile << setprecision(6);
418                 errorFile.setf(ios::fixed);
419         }
420         catch(exception& e) {
421                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
422                 exit(1);
423         }
424 }
425
426 //***************************************************************************************************************
427
428 void SeqErrorCommand::printErrorData(Compare error){
429         try {
430                 errorFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
431                 errorFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
432                 errorFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
433                 errorFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
434                 errorFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
435                 errorFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
436                 errorFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
437                 errorFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
438                 
439                 errorFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                 //insertions
440                 errorFile << error.dA + error.dT + error.dG + error.dC << '\t';                 //deletions
441                 errorFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';     //substitutions
442                 errorFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';      //ambiguities
443                 errorFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl;
444                 
445         }
446         catch(exception& e) {
447                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
448                 exit(1);
449         }
450 }
451
452 //***************************************************************************************************************
453
454
455
456
457
458
459