]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
moved utilities out of mothur.h and into mothurOut class.
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11
12 //***************************************************************************************************************
13
14 SeqErrorCommand::SeqErrorCommand(string option)  {
15         try {
16                 
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         string temp;
24                         
25                         //valid paramters for this command
26                         string AlignArray[] =  {"query", "reference", "name", "threshold"};
27                         
28 //need to implement name file option
29                         
30                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
31                         
32                         OptionParser parser(option);
33                         map<string,string> parameters = parser.getParameters();
34                         
35                         ValidParameters validParameter;
36                         map<string,string>::iterator it;
37                         
38                         //check to make sure all parameters are valid for command
39                         for (it = parameters.begin(); it != parameters.end(); it++) { 
40                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
41                         }
42                         
43                         //if the user changes the input directory command factory will send this info to us in the output parameter 
44                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
45                         if (inputDir == "not found"){   inputDir = "";          }
46                         else {
47                                 string path;
48                                 it = parameters.find("query");
49                                 //user has given a template file
50                                 if(it != parameters.end()){ 
51                                         path = m->hasPath(it->second);
52                                         //if the user has not given a path then, add inputdir. else leave path alone.
53                                         if (path == "") {       parameters["query"] = inputDir + it->second;            }
54                                 }
55                                 
56                                 it = parameters.find("reference");
57                                 //user has given a template file
58                                 if(it != parameters.end()){ 
59                                         path = m->hasPath(it->second);
60                                         //if the user has not given a path then, add inputdir. else leave path alone.
61                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
62                                 }
63                                 
64                                 it = parameters.find("name");
65                                 //user has given a template file
66                                 if(it != parameters.end()){ 
67                                         path = m->hasPath(it->second);
68                                         //if the user has not given a path then, add inputdir. else leave path alone.
69                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
70                                 }
71                                 
72                         }
73                         //check for required parameters
74                         queryFileName = validParameter.validFile(parameters, "query", true);
75                         if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
76                         else if (queryFileName == "not open") { abort = true; } 
77                         
78                         referenceFileName = validParameter.validFile(parameters, "reference", true);
79                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
80                         else if (referenceFileName == "not open") { abort = true; }     
81                         
82                         //if the user changes the output directory command factory will send this info to us in the output parameter 
83                         namesFileName = validParameter.validFile(parameters, "name", true);
84                         if(namesFileName == "not found"){       namesFileName = "";     }
85                         cout << namesFileName << endl;
86                         
87                         outputDir = validParameter.validFile(parameters, "outputdir", false);
88                         if (outputDir == "not found"){  
89                                 outputDir = ""; 
90                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
91                         }
92                         
93                         //check for optional parameter and set defaults
94                         // ...at some point should added some additional type checking...
95                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
96                         convert(temp, threshold);  
97                                                 
98                         errorFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".errors";
99                         m->openOutputFile(errorFileName, errorFile);
100                         printErrorHeader();
101                 }
102         }
103         catch(exception& e) {
104                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
105                 exit(1);
106         }
107 }
108
109 //**********************************************************************************************************************
110
111 void SeqErrorCommand::help(){
112         try {
113                 m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n");
114                 
115                 
116                 
117                 m->mothurOut("Example seq.error(...).\n");
118                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
119                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n");
120         }
121         catch(exception& e) {
122                 m->errorOut(e, "SeqErrorCommand", "help");
123                 exit(1);
124         }
125 }
126
127 //***************************************************************************************************************
128
129 SeqErrorCommand::~SeqErrorCommand(){    errorFile.close();      }
130
131 //***************************************************************************************************************
132
133 int SeqErrorCommand::execute(){
134         try{
135                 if (abort == true) { return 0; }
136
137                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
138
139                 map<string, int> weights;
140                 if(namesFileName != ""){        weights = getWeights(); }
141                 
142                 ifstream queryFile;
143                 m->openInputFile(queryFileName, queryFile);
144                                 
145                 int totalBases = 0;
146                 int totalMatches = 0;
147                 
148                 vector<int> misMatchCounts(11, 0);
149                 int maxMismatch = 0;
150                 int numSeqs = 0;
151                 
152                 map<string, int>::iterator it;
153                 
154                 while(queryFile){
155                         Compare minCompare;
156                         
157                         Sequence query(queryFile);
158                         
159                         for(int i=0;i<numRefs;i++){
160                                 Compare currCompare = getErrors(query, referenceSeqs[i]);
161                                 
162                                 if(currCompare.errorRate < minCompare.errorRate){
163                                         minCompare = currCompare;
164                                 }
165                                 
166                         }
167
168                         if(namesFileName != ""){
169                                 it = weights.find(query.getName());
170                                 minCompare.weight = it->second;
171                         }
172                         else {
173                                 minCompare.weight = 1;
174                         }
175
176                         printErrorData(minCompare);
177
178                         if(minCompare.errorRate < threshold){
179                                 totalBases += (minCompare.total * minCompare.weight);
180                                 totalMatches += minCompare.matches * minCompare.weight;
181                                 if(minCompare.mismatches > maxMismatch){
182                                         maxMismatch = minCompare.mismatches;
183                                         misMatchCounts.resize(maxMismatch + 1, 0);
184                                 }                               
185                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
186                                 numSeqs++;
187                         }
188                         
189                 }
190                 queryFile.close();
191                 
192                 int total = 0;
193                 
194                 
195                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".count";
196                 ofstream errorCountFile;
197                 m->openOutputFile(errorCountFileName, errorCountFile);
198                 
199                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n");
200                 m->mothurOut("Errors\tSequences\n");
201                 
202                 errorCountFile << "Errors\tSequences\n";
203                 
204                 for(int i=0;i<misMatchCounts.size();i++){
205                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
206                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
207                 }
208                 
209                 return 0;       
210         }
211         catch(exception& e) {
212                 m->errorOut(e, "SeqErrorCommand", "execute");
213                 exit(1);
214         }
215 }
216
217 //***************************************************************************************************************
218
219 void SeqErrorCommand::getReferences(){
220         try {
221                 
222                 ifstream referenceFile;
223                 m->openInputFile(referenceFileName, referenceFile);
224                 
225                 while(referenceFile){
226                         Sequence currentSeq(referenceFile);
227                         int numAmbigs = currentSeq.getAmbigBases();
228                         
229                         if(numAmbigs != 0){
230                                 m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
231                                 currentSeq.removeAmbigBases();
232                         }
233                         referenceSeqs.push_back(currentSeq);
234                         m->gobble(referenceFile);
235                 }
236                 numRefs = referenceSeqs.size();
237                 
238                 referenceFile.close();
239         }
240         catch(exception& e) {
241                 m->errorOut(e, "SeqErrorCommand", "getReferences");
242                 exit(1);
243         }
244 }
245
246 //***************************************************************************************************************
247
248 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
249         try {
250                 if(query.getAlignLength() != reference.getAlignLength()){
251                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
252                 }
253                 int alignLength = query.getAlignLength();
254         
255                 string q = query.getAligned();
256                 string r = reference.getAligned();
257
258                 int started = 0;
259                 Compare errors;
260
261                 for(int i=0;i<alignLength;i++){
262                         if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                 //      no missing data and no double gaps
263                                 started = 1;
264                                 
265                                 if(q[i] == 'A'){
266                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       }
267                                         if(r[i] == 'T'){        errors.AT++;    }
268                                         if(r[i] == 'G'){        errors.AG++;    }
269                                         if(r[i] == 'C'){        errors.AC++;    }
270                                         if(r[i] == '-'){        errors.Ai++;    }
271                                 }
272                                 else if(q[i] == 'T'){
273                                         if(r[i] == 'A'){        errors.TA++;    }
274                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       }
275                                         if(r[i] == 'G'){        errors.TG++;    }
276                                         if(r[i] == 'C'){        errors.TC++;    }
277                                         if(r[i] == '-'){        errors.Ti++;    }
278                                 }
279                                 else if(q[i] == 'G'){
280                                         if(r[i] == 'A'){        errors.GA++;    }
281                                         if(r[i] == 'T'){        errors.GT++;    }
282                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       }
283                                         if(r[i] == 'C'){        errors.GC++;    }
284                                         if(r[i] == '-'){        errors.Gi++;    }
285                                 }
286                                 else if(q[i] == 'C'){
287                                         if(r[i] == 'A'){        errors.CA++;    }
288                                         if(r[i] == 'T'){        errors.CT++;    }
289                                         if(r[i] == 'G'){        errors.CG++;    }
290                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       }
291                                         if(r[i] == '-'){        errors.Ci++;    }
292                                 }
293                                 else if(q[i] == 'N'){
294                                         if(r[i] == 'A'){        errors.NA++;    }
295                                         if(r[i] == 'T'){        errors.NT++;    }
296                                         if(r[i] == 'G'){        errors.NG++;    }
297                                         if(r[i] == 'C'){        errors.NC++;    }
298                                         if(r[i] == '-'){        errors.Ni++;    }
299                                 }
300                                 else if(q[i] == '-' && r[i] != '-'){
301                                         if(r[i] == 'A'){        errors.dA++;    }
302                                         if(r[i] == 'T'){        errors.dT++;    }
303                                         if(r[i] == 'G'){        errors.dG++;    }
304                                         if(r[i] == 'C'){        errors.dC++;    }
305                                 }
306                                 errors.total++; 
307                                 
308                         }
309                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
310                                 if(started == 1){       break;  }
311                         }
312                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
313                                 m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ".  Ignoring the extra bases in the query\n");
314                                 if(started == 1){       break;  }
315                         }
316                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
317                                 if(started == 1){       break;  }                       
318                         }
319                         
320                 }
321                 errors.mismatches = errors.total-errors.matches;
322                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
323                 errors.queryName = query.getName();
324                 errors.refName = reference.getName();
325                 
326                 return errors;
327         }
328         catch(exception& e) {
329                 m->errorOut(e, "SeqErrorCommand", "getErrors");
330                 exit(1);
331         }
332 }
333
334 //***************************************************************************************************************
335
336 map<string, int> SeqErrorCommand::getWeights(){
337         ifstream nameFile;
338         m->openInputFile(namesFileName, nameFile);
339         
340         string seqName;
341         string redundantSeqs;
342         map<string, int> nameCountMap;
343         
344         while(nameFile){
345                 nameFile >> seqName >> redundantSeqs;
346                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
347                 m->gobble(nameFile);
348         }
349         return nameCountMap;
350 }
351
352
353 //***************************************************************************************************************
354
355 void SeqErrorCommand::printErrorHeader(){
356         try {
357                 errorFile << "query\treference\tweight\t";
358                 errorFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
359                 errorFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n";
360                 
361                 errorFile << setprecision(6);
362                 errorFile.setf(ios::fixed);
363         }
364         catch(exception& e) {
365                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
366                 exit(1);
367         }
368 }
369
370 //***************************************************************************************************************
371
372 void SeqErrorCommand::printErrorData(Compare error){
373         try {
374                 errorFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
375                 errorFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
376                 errorFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
377                 errorFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
378                 errorFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
379                 errorFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
380                 errorFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
381                 errorFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
382                 
383                 errorFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                 //insertions
384                 errorFile << error.dA + error.dT + error.dG + error.dC << '\t';                 //deletions
385                 errorFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';     //substitutions
386                 errorFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';      //ambiguities
387                 errorFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl;
388                 
389         }
390         catch(exception& e) {
391                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
392                 exit(1);
393         }
394 }
395
396 //***************************************************************************************************************
397
398
399
400
401
402
403