]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
changes to seqerrorcommand
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> SeqErrorCommand::getValidParameters(){   
14         try {
15                 string Array[] =  {"query", "reference", "name", "threshold", "inputdir", "outputdir"};
16                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
17                 return myArray;
18         }
19         catch(exception& e) {
20                 m->errorOut(e, "SeqErrorCommand", "getValidParameters");
21                 exit(1);
22         }
23 }
24 //**********************************************************************************************************************
25 SeqErrorCommand::SeqErrorCommand(){     
26         try {
27                 abort = true;
28                 //initialize outputTypes
29                 vector<string> tempOutNames;
30                 outputTypes["error"] = tempOutNames;
31                 outputTypes["count"] = tempOutNames;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 vector<string> SeqErrorCommand::getRequiredParameters(){        
40         try {
41                 string Array[] =  {"query","reference"};
42                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "SeqErrorCommand", "getRequiredParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 vector<string> SeqErrorCommand::getRequiredFiles(){     
52         try {
53                 vector<string> myArray;
54                 return myArray;
55         }
56         catch(exception& e) {
57                 m->errorOut(e, "SeqErrorCommand", "getRequiredFiles");
58                 exit(1);
59         }
60 }
61 //***************************************************************************************************************
62
63 SeqErrorCommand::SeqErrorCommand(string option)  {
64         try {
65                 
66                 abort = false;
67                 
68                 //allow user to run help
69                 if(option == "help") { help(); abort = true; }
70                 
71                 else {
72                         string temp;
73                         
74                         //valid paramters for this command
75                         string AlignArray[] =  {"query", "reference", "name", "threshold", "inputdir", "outputdir"};
76                         
77 //need to implement name file option
78                         
79                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
80                         
81                         OptionParser parser(option);
82                         map<string,string> parameters = parser.getParameters();
83                         
84                         ValidParameters validParameter;
85                         map<string,string>::iterator it;
86                         
87                         //check to make sure all parameters are valid for command
88                         for (it = parameters.begin(); it != parameters.end(); it++) { 
89                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
90                         }
91                         
92                         //initialize outputTypes
93                         vector<string> tempOutNames;
94                         outputTypes["error"] = tempOutNames;
95                         outputTypes["count"] = tempOutNames;
96                         
97                         //if the user changes the input directory command factory will send this info to us in the output parameter 
98                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
99                         if (inputDir == "not found"){   inputDir = "";          }
100                         else {
101                                 string path;
102                                 it = parameters.find("query");
103                                 //user has given a template file
104                                 if(it != parameters.end()){ 
105                                         path = m->hasPath(it->second);
106                                         //if the user has not given a path then, add inputdir. else leave path alone.
107                                         if (path == "") {       parameters["query"] = inputDir + it->second;            }
108                                 }
109                                 
110                                 it = parameters.find("reference");
111                                 //user has given a template file
112                                 if(it != parameters.end()){ 
113                                         path = m->hasPath(it->second);
114                                         //if the user has not given a path then, add inputdir. else leave path alone.
115                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
116                                 }
117                                 
118                                 it = parameters.find("name");
119                                 //user has given a template file
120                                 if(it != parameters.end()){ 
121                                         path = m->hasPath(it->second);
122                                         //if the user has not given a path then, add inputdir. else leave path alone.
123                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
124                                 }
125                                 
126                         }
127                         //check for required parameters
128                         queryFileName = validParameter.validFile(parameters, "query", true);
129                         if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
130                         else if (queryFileName == "not open") { abort = true; } 
131                         
132                         referenceFileName = validParameter.validFile(parameters, "reference", true);
133                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
134                         else if (referenceFileName == "not open") { abort = true; }     
135                         
136                         //if the user changes the output directory command factory will send this info to us in the output parameter 
137                         namesFileName = validParameter.validFile(parameters, "name", true);
138                         if(namesFileName == "not found"){       namesFileName = "";     }
139                         cout << namesFileName << endl;
140                         
141                         outputDir = validParameter.validFile(parameters, "outputdir", false);
142                         if (outputDir == "not found"){  
143                                 outputDir = ""; 
144                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
145                         }
146                         
147                         //check for optional parameter and set defaults
148                         // ...at some point should added some additional type checking...
149                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
150                         convert(temp, threshold);  
151                                                 
152                         errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
153                         m->openOutputFile(errorSummaryFileName, errorSummaryFile);
154                         outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
155                         printErrorHeader();
156
157                         errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq";
158                         m->openOutputFile(errorSeqFileName, errorSeqFile);
159                         outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName);
160                         printErrorHeader();
161                         
162                 }
163         }
164         catch(exception& e) {
165                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
166                 exit(1);
167         }
168 }
169
170 //**********************************************************************************************************************
171
172 void SeqErrorCommand::help(){
173         try {
174                 m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n");
175                 
176                 
177                 
178                 m->mothurOut("Example seq.error(...).\n");
179                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
180                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n");
181         }
182         catch(exception& e) {
183                 m->errorOut(e, "SeqErrorCommand", "help");
184                 exit(1);
185         }
186 }
187
188 //***************************************************************************************************************
189
190 SeqErrorCommand::~SeqErrorCommand(){
191         errorSummaryFile.close();       
192         errorSeqFile.close();   
193 }
194
195 //***************************************************************************************************************
196
197 int SeqErrorCommand::execute(){
198         try{
199                 if (abort == true) { return 0; }
200
201                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
202
203                 map<string, int> weights;
204                 if(namesFileName != ""){        weights = getWeights(); }
205                 
206                 ifstream queryFile;
207                 m->openInputFile(queryFileName, queryFile);
208                                 
209                 int totalBases = 0;
210                 int totalMatches = 0;
211                 
212                 vector<int> misMatchCounts(11, 0);
213                 int maxMismatch = 0;
214                 int numSeqs = 0;
215                 
216                 map<string, int>::iterator it;
217                 
218                 while(queryFile){
219                         Compare minCompare;
220                         
221                         Sequence query(queryFile);
222                         
223                         for(int i=0;i<numRefs;i++){
224                                 Compare currCompare = getErrors(query, referenceSeqs[i]);
225                                 
226                                 if(currCompare.errorRate < minCompare.errorRate){
227                                         minCompare = currCompare;
228                                 }
229                                 
230                         }
231
232                         if(namesFileName != ""){
233                                 it = weights.find(query.getName());
234                                 minCompare.weight = it->second;
235                         }
236                         else {
237                                 minCompare.weight = 1;
238                         }
239
240                         printErrorData(minCompare);
241
242                         if(minCompare.errorRate < threshold){
243                                 totalBases += (minCompare.total * minCompare.weight);
244                                 totalMatches += minCompare.matches * minCompare.weight;
245                                 if(minCompare.mismatches > maxMismatch){
246                                         maxMismatch = minCompare.mismatches;
247                                         misMatchCounts.resize(maxMismatch + 1, 0);
248                                 }                               
249                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
250                                 numSeqs++;
251                         }
252                         
253                 }
254                 queryFile.close();
255                 
256                 int total = 0;
257                 
258                 
259                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count";
260                 ofstream errorCountFile;
261                 m->openOutputFile(errorCountFileName, errorCountFile);
262                 outputNames.push_back(errorCountFileName);  outputTypes["error.count"].push_back(errorCountFileName);
263                 
264                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n");
265                 m->mothurOut("Errors\tSequences\n");
266                 
267                 errorCountFile << "Errors\tSequences\n";
268                 
269                 for(int i=0;i<misMatchCounts.size();i++){
270                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
271                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
272                 }
273                 
274                 return 0;       
275         }
276         catch(exception& e) {
277                 m->errorOut(e, "SeqErrorCommand", "execute");
278                 exit(1);
279         }
280 }
281
282 //***************************************************************************************************************
283
284 void SeqErrorCommand::getReferences(){
285         try {
286                 
287                 ifstream referenceFile;
288                 m->openInputFile(referenceFileName, referenceFile);
289                 
290                 while(referenceFile){
291                         Sequence currentSeq(referenceFile);
292                         int numAmbigs = currentSeq.getAmbigBases();
293                         
294                         if(numAmbigs != 0){
295                                 m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
296                                 currentSeq.removeAmbigBases();
297                         }
298                         referenceSeqs.push_back(currentSeq);
299                         m->gobble(referenceFile);
300                 }
301                 numRefs = referenceSeqs.size();
302                 
303                 referenceFile.close();
304         }
305         catch(exception& e) {
306                 m->errorOut(e, "SeqErrorCommand", "getReferences");
307                 exit(1);
308         }
309 }
310
311 //***************************************************************************************************************
312
313 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
314         try {
315                 if(query.getAlignLength() != reference.getAlignLength()){
316                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
317                 }
318                 int alignLength = query.getAlignLength();
319         
320                 string q = query.getAligned();
321                 string r = reference.getAligned();
322
323                 
324                 int started = 0;
325                 Compare errors;
326
327                 for(int i=0;i<alignLength;i++){
328                         if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                 //      no missing data and no double gaps
329                                 started = 1;
330                                 
331                                 if(q[i] == 'A'){
332                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       errors.sequence += 'm'; }
333                                         if(r[i] == 'T'){        errors.AT++;    errors.sequence += 's'; }
334                                         if(r[i] == 'G'){        errors.AG++;    errors.sequence += 's'; }
335                                         if(r[i] == 'C'){        errors.AC++;    errors.sequence += 's'; }
336                                         if(r[i] == '-'){        errors.Ai++;    errors.sequence += 'i'; }
337                                 }
338                                 else if(q[i] == 'T'){
339                                         if(r[i] == 'A'){        errors.TA++;    errors.sequence += 's'; }
340                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       errors.sequence += 'm'; }
341                                         if(r[i] == 'G'){        errors.TG++;    errors.sequence += 's'; }
342                                         if(r[i] == 'C'){        errors.TC++;    errors.sequence += 's'; }
343                                         if(r[i] == '-'){        errors.Ti++;    errors.sequence += 'i'; }
344                                 }
345                                 else if(q[i] == 'G'){
346                                         if(r[i] == 'A'){        errors.GA++;    errors.sequence += 's'; }
347                                         if(r[i] == 'T'){        errors.GT++;    errors.sequence += 's'; }
348                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       errors.sequence += 'm'; }
349                                         if(r[i] == 'C'){        errors.GC++;    errors.sequence += 's'; }
350                                         if(r[i] == '-'){        errors.Gi++;    errors.sequence += 'i'; }
351                                 }
352                                 else if(q[i] == 'C'){
353                                         if(r[i] == 'A'){        errors.CA++;    errors.sequence += 's'; }
354                                         if(r[i] == 'T'){        errors.CT++;    errors.sequence += 's'; }
355                                         if(r[i] == 'G'){        errors.CG++;    errors.sequence += 's'; }
356                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       errors.sequence += 'm'; }
357                                         if(r[i] == '-'){        errors.Ci++;    errors.sequence += 'i'; }
358                                 }
359                                 else if(q[i] == 'N'){
360                                         if(r[i] == 'A'){        errors.NA++;    errors.sequence += 'a'; }
361                                         if(r[i] == 'T'){        errors.NT++;    errors.sequence += 'a'; }
362                                         if(r[i] == 'G'){        errors.NG++;    errors.sequence += 'a'; }
363                                         if(r[i] == 'C'){        errors.NC++;    errors.sequence += 'a'; }
364                                         if(r[i] == '-'){        errors.Ni++;    errors.sequence += 'a'; }
365                                 }
366                                 else if(q[i] == '-' && r[i] != '-'){
367                                         if(r[i] == 'A'){        errors.dA++;    errors.sequence += 'd'; }
368                                         if(r[i] == 'T'){        errors.dT++;    errors.sequence += 'd'; }
369                                         if(r[i] == 'G'){        errors.dG++;    errors.sequence += 'd'; }
370                                         if(r[i] == 'C'){        errors.dC++;    errors.sequence += 'd'; }
371                                 }
372                                 errors.total++; 
373                                 
374                         }
375                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
376                                 if(started == 1){       break;  }
377                         }
378                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
379                                 m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ".  Ignoring the extra bases in the query\n");
380                                 if(started == 1){       break;  }
381                         }
382                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
383                                 if(started == 1){       break;  }                       
384                         }
385                         
386                 }
387                 errors.mismatches = errors.total-errors.matches;
388                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
389                 errors.queryName = query.getName();
390                 errors.refName = reference.getName();
391                 
392                 return errors;
393         }
394         catch(exception& e) {
395                 m->errorOut(e, "SeqErrorCommand", "getErrors");
396                 exit(1);
397         }
398 }
399
400 //***************************************************************************************************************
401
402 map<string, int> SeqErrorCommand::getWeights(){
403         ifstream nameFile;
404         m->openInputFile(namesFileName, nameFile);
405         
406         string seqName;
407         string redundantSeqs;
408         map<string, int> nameCountMap;
409         
410         while(nameFile){
411                 nameFile >> seqName >> redundantSeqs;
412                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
413                 m->gobble(nameFile);
414         }
415         return nameCountMap;
416 }
417
418
419 //***************************************************************************************************************
420
421 void SeqErrorCommand::printErrorHeader(){
422         try {
423                 errorSummaryFile << "query\treference\tweight\t";
424                 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
425                 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n";
426                 
427                 errorSummaryFile << setprecision(6);
428                 errorSummaryFile.setf(ios::fixed);
429         }
430         catch(exception& e) {
431                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
432                 exit(1);
433         }
434 }
435
436 //***************************************************************************************************************
437
438 void SeqErrorCommand::printErrorData(Compare error){
439         try {
440                 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
441                 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
442                 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
443                 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
444                 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
445                 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
446                 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
447                 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
448                 
449                 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                  //insertions
450                 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t';                  //deletions
451                 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';      //substitutions
452                 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';       //ambiguities
453                 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl;
454                 
455                 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
456         }
457         catch(exception& e) {
458                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
459                 exit(1);
460         }
461 }
462
463 //***************************************************************************************************************
464
465
466
467
468
469
470