]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
pat's changes to seq.error command
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11 #include "reportfile.h"
12 #include "qualityscores.h"
13
14 //**********************************************************************************************************************
15 vector<string> SeqErrorCommand::getValidParameters(){   
16         try {
17                 string Array[] =  {"query", "reference", "name", "qfile", "report", "threshold", "inputdir", "outputdir"};
18                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
19                 return myArray;
20         }
21         catch(exception& e) {
22                 m->errorOut(e, "SeqErrorCommand", "getValidParameters");
23                 exit(1);
24         }
25 }
26 //**********************************************************************************************************************
27 SeqErrorCommand::SeqErrorCommand(){     
28         try {
29                 abort = true;
30                 //initialize outputTypes
31                 vector<string> tempOutNames;
32                 outputTypes["error"] = tempOutNames;
33                 outputTypes["count"] = tempOutNames;
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
37                 exit(1);
38         }
39 }
40 //**********************************************************************************************************************
41 vector<string> SeqErrorCommand::getRequiredParameters(){        
42         try {
43                 string Array[] =  {"query","reference"};
44                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
45                 return myArray;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "SeqErrorCommand", "getRequiredParameters");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 vector<string> SeqErrorCommand::getRequiredFiles(){     
54         try {
55                 vector<string> myArray;
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "SeqErrorCommand", "getRequiredFiles");
60                 exit(1);
61         }
62 }
63 //***************************************************************************************************************
64
65 SeqErrorCommand::SeqErrorCommand(string option)  {
66         try {
67                 
68                 abort = false;
69                 
70                 //allow user to run help
71                 if(option == "help") { help(); abort = true; }
72                 
73                 else {
74                         string temp;
75                         
76                         //valid paramters for this command
77                         string AlignArray[] =  {"query", "reference", "name", "qfile", "report", "threshold", "inputdir", "outputdir"};
78                         
79 //need to implement name file option
80                         
81                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
82                         
83                         OptionParser parser(option);
84                         map<string,string> parameters = parser.getParameters();
85                         
86                         ValidParameters validParameter;
87                         map<string,string>::iterator it;
88                         
89                         //check to make sure all parameters are valid for command
90                         for (it = parameters.begin(); it != parameters.end(); it++) { 
91                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
92                         }
93                         
94                         //initialize outputTypes
95                         vector<string> tempOutNames;
96                         outputTypes["error"] = tempOutNames;
97                         outputTypes["count"] = tempOutNames;
98                         
99                         //if the user changes the input directory command factory will send this info to us in the output parameter 
100                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
101                         if (inputDir == "not found"){   inputDir = "";          }
102                         else {
103                                 string path;
104                                 it = parameters.find("query");
105                                 //user has given a template file
106                                 if(it != parameters.end()){ 
107                                         path = m->hasPath(it->second);
108                                         //if the user has not given a path then, add inputdir. else leave path alone.
109                                         if (path == "") {       parameters["query"] = inputDir + it->second;            }
110                                 }
111                                 
112                                 it = parameters.find("reference");
113                                 //user has given a template file
114                                 if(it != parameters.end()){ 
115                                         path = m->hasPath(it->second);
116                                         //if the user has not given a path then, add inputdir. else leave path alone.
117                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
118                                 }
119                                 
120                                 it = parameters.find("name");
121                                 //user has given a names file
122                                 if(it != parameters.end()){ 
123                                         path = m->hasPath(it->second);
124                                         //if the user has not given a path then, add inputdir. else leave path alone.
125                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
126                                 }
127
128                                 it = parameters.find("qfile");
129                                 //user has given a quality score file
130                                 if(it != parameters.end()){ 
131                                         path = m->hasPath(it->second);
132                                         //if the user has not given a path then, add inputdir. else leave path alone.
133                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
134                                 }
135                                 
136                                 it = parameters.find("report");
137                                 //user has given a alignment report file
138                                 if(it != parameters.end()){ 
139                                         path = m->hasPath(it->second);
140                                         //if the user has not given a path then, add inputdir. else leave path alone.
141                                         if (path == "") {       parameters["report"] = inputDir + it->second;           }
142                                 }
143                                 
144                         }
145                         //check for required parameters
146                         queryFileName = validParameter.validFile(parameters, "query", true);
147                         if (queryFileName == "not found") { m->mothurOut("query is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
148                         else if (queryFileName == "not open") { abort = true; } 
149                         
150                         referenceFileName = validParameter.validFile(parameters, "reference", true);
151                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
152                         else if (referenceFileName == "not open") { abort = true; }     
153                         
154
155                         //check for optional parameters
156                         namesFileName = validParameter.validFile(parameters, "name", true);
157                         if(namesFileName == "not found"){       namesFileName = "";     }
158                         
159                         qualFileName = validParameter.validFile(parameters, "qfile", true);
160                         if(qualFileName == "not found"){        qualFileName = "";      }
161
162                         reportFileName = validParameter.validFile(parameters, "report", true);
163                         if(reportFileName == "not found"){      reportFileName = "";    }
164                         
165                         if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
166                                 m->mothurOut("if you use either a qual file or a report file, you have to have both.");
167                                 m->mothurOutEndLine();
168                                 abort = true; 
169                         }
170                         
171                         
172                         
173                         outputDir = validParameter.validFile(parameters, "outputdir", false);
174                         if (outputDir == "not found"){  
175                                 outputDir = ""; 
176                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
177                         }
178                         
179                         //check for optional parameter and set defaults
180                         // ...at some point should added some additional type checking...
181                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
182                         convert(temp, threshold);  
183                                                 
184                         errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
185                         m->openOutputFile(errorSummaryFileName, errorSummaryFile);
186                         outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
187                         printErrorHeader();
188
189                         errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq";
190                         m->openOutputFile(errorSeqFileName, errorSeqFile);
191                         outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName);
192                         printErrorHeader();
193                         
194                 }
195         }
196         catch(exception& e) {
197                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
198                 exit(1);
199         }
200 }
201
202 //**********************************************************************************************************************
203
204 void SeqErrorCommand::help(){
205         try {
206                 m->mothurOut("The seq.error command reads a query alignment file and a reference alignment file and creates .....\n");
207                 
208                 
209                 
210                 m->mothurOut("Example seq.error(...).\n");
211                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
212                 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n\n");
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "SeqErrorCommand", "help");
216                 exit(1);
217         }
218 }
219
220 //***************************************************************************************************************
221
222 SeqErrorCommand::~SeqErrorCommand(){
223         errorSummaryFile.close();       
224         errorSeqFile.close();   
225 }
226
227 //***************************************************************************************************************
228
229 int SeqErrorCommand::execute(){
230         try{
231                 if (abort == true) { return 0; }
232
233                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
234
235                 map<string, int> weights;
236                 if(namesFileName != ""){        weights = getWeights(); }
237                 
238                 ifstream queryFile;
239                 m->openInputFile(queryFileName, queryFile);
240                 
241                 ifstream reportFile;
242                 ifstream qualFile;
243
244                 ReportFile report;
245                 QualityScores quality;
246
247                 if(qualFileName != "" && reportFileName != ""){
248                         m->openInputFile(qualFileName, qualFile);
249                         report = ReportFile(reportFile, reportFileName);
250                 }
251                 
252                 int totalBases = 0;
253                 int totalMatches = 0;
254                 
255                 vector<int> misMatchCounts(11, 0);
256                 int maxMismatch = 0;
257                 int numSeqs = 0;
258                 
259                 map<string, int>::iterator it;
260                 map<char, vector<int> > qScoreErrorMap;
261                 qScoreErrorMap['m'].assign(41, 0);
262                 qScoreErrorMap['s'].assign(41, 0);
263                 qScoreErrorMap['i'].assign(41, 0);
264                 qScoreErrorMap['a'].assign(41, 0);
265                 
266                 while(queryFile){
267                         Compare minCompare;
268                         Sequence query(queryFile);
269                         
270                         for(int i=0;i<numRefs;i++){
271                                 Compare currCompare = getErrors(query, referenceSeqs[i]);
272                                 
273                                 if(currCompare.errorRate < minCompare.errorRate){
274                                         minCompare = currCompare;
275                                 }
276                         }
277
278                         if(namesFileName != ""){
279                                 it = weights.find(query.getName());
280                                 minCompare.weight = it->second;
281                         }
282                         else    {       minCompare.weight = 1;  }
283
284                         printErrorData(minCompare);
285
286                         if(qualFileName != "" && reportFileName != ""){
287                                 report = ReportFile(reportFile);
288                                 
289                                 int origLength = report.getQueryLength();
290                                 int startBase = report.getQueryStart();
291                                 int endBase = report.getQueryEnd();
292
293                                 quality = QualityScores(qualFile, origLength);
294                                 quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
295                         }                       
296                         
297                         if(minCompare.errorRate < threshold){
298                                 totalBases += (minCompare.total * minCompare.weight);
299                                 totalMatches += minCompare.matches * minCompare.weight;
300                                 if(minCompare.mismatches > maxMismatch){
301                                         maxMismatch = minCompare.mismatches;
302                                         misMatchCounts.resize(maxMismatch + 1, 0);
303                                 }                               
304                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
305                                 numSeqs++;
306                         }
307                         
308                         
309                 }
310                 queryFile.close();
311                 
312                 int total = 0;
313                 
314                 if(qualFileName != "" && reportFileName != ""){
315                         string errorQualityFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.quality";
316                         ofstream errorQualityFile;
317                         m->openOutputFile(errorQualityFileName, errorQualityFile);
318                         outputNames.push_back(errorQualityFileName);  outputTypes["error.quality"].push_back(errorQualityFileName);
319                         
320                         errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl;
321                         for(int i=0;i<41;i++){
322                                 errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl;
323                         }
324                 }
325                 
326                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count";
327                 ofstream errorCountFile;
328                 m->openOutputFile(errorCountFileName, errorCountFile);
329                 outputNames.push_back(errorCountFileName);  outputTypes["error.count"].push_back(errorCountFileName);
330                 
331                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n\n");
332                 m->mothurOut("Errors\tSequences\n");
333                 
334                 errorCountFile << "Errors\tSequences\n";
335                 
336                 for(int i=0;i<misMatchCounts.size();i++){
337                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
338                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
339                 }
340                 
341                 return 0;       
342         }
343         catch(exception& e) {
344                 m->errorOut(e, "SeqErrorCommand", "execute");
345                 exit(1);
346         }
347 }
348
349 //***************************************************************************************************************
350
351 void SeqErrorCommand::getReferences(){
352         try {
353                 
354                 ifstream referenceFile;
355                 m->openInputFile(referenceFileName, referenceFile);
356                 
357                 while(referenceFile){
358                         Sequence currentSeq(referenceFile);
359                         int numAmbigs = currentSeq.getAmbigBases();
360                         
361                         if(numAmbigs != 0){
362                                 m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
363                                 currentSeq.removeAmbigBases();
364                         }
365                         referenceSeqs.push_back(currentSeq);
366                         m->gobble(referenceFile);
367                 }
368                 numRefs = referenceSeqs.size();
369                 
370                 referenceFile.close();
371         }
372         catch(exception& e) {
373                 m->errorOut(e, "SeqErrorCommand", "getReferences");
374                 exit(1);
375         }
376 }
377
378 //***************************************************************************************************************
379
380 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
381         try {
382                 if(query.getAlignLength() != reference.getAlignLength()){
383                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
384                 }
385                 int alignLength = query.getAlignLength();
386         
387                 string q = query.getAligned();
388                 string r = reference.getAligned();
389
390                 
391                 int started = 0;
392                 Compare errors;
393
394                 for(int i=0;i<alignLength;i++){
395                         if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                 //      no missing data and no double gaps
396                                 started = 1;
397                                 
398                                 if(q[i] == 'A'){
399                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       errors.sequence += 'm'; }
400                                         if(r[i] == 'T'){        errors.AT++;    errors.sequence += 's'; }
401                                         if(r[i] == 'G'){        errors.AG++;    errors.sequence += 's'; }
402                                         if(r[i] == 'C'){        errors.AC++;    errors.sequence += 's'; }
403                                         if(r[i] == '-'){        errors.Ai++;    errors.sequence += 'i'; }
404                                 }
405                                 else if(q[i] == 'T'){
406                                         if(r[i] == 'A'){        errors.TA++;    errors.sequence += 's'; }
407                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       errors.sequence += 'm'; }
408                                         if(r[i] == 'G'){        errors.TG++;    errors.sequence += 's'; }
409                                         if(r[i] == 'C'){        errors.TC++;    errors.sequence += 's'; }
410                                         if(r[i] == '-'){        errors.Ti++;    errors.sequence += 'i'; }
411                                 }
412                                 else if(q[i] == 'G'){
413                                         if(r[i] == 'A'){        errors.GA++;    errors.sequence += 's'; }
414                                         if(r[i] == 'T'){        errors.GT++;    errors.sequence += 's'; }
415                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       errors.sequence += 'm'; }
416                                         if(r[i] == 'C'){        errors.GC++;    errors.sequence += 's'; }
417                                         if(r[i] == '-'){        errors.Gi++;    errors.sequence += 'i'; }
418                                 }
419                                 else if(q[i] == 'C'){
420                                         if(r[i] == 'A'){        errors.CA++;    errors.sequence += 's'; }
421                                         if(r[i] == 'T'){        errors.CT++;    errors.sequence += 's'; }
422                                         if(r[i] == 'G'){        errors.CG++;    errors.sequence += 's'; }
423                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       errors.sequence += 'm'; }
424                                         if(r[i] == '-'){        errors.Ci++;    errors.sequence += 'i'; }
425                                 }
426                                 else if(q[i] == 'N'){
427                                         if(r[i] == 'A'){        errors.NA++;    errors.sequence += 'a'; }
428                                         if(r[i] == 'T'){        errors.NT++;    errors.sequence += 'a'; }
429                                         if(r[i] == 'G'){        errors.NG++;    errors.sequence += 'a'; }
430                                         if(r[i] == 'C'){        errors.NC++;    errors.sequence += 'a'; }
431                                         if(r[i] == '-'){        errors.Ni++;    errors.sequence += 'a'; }
432                                 }
433                                 else if(q[i] == '-' && r[i] != '-'){
434                                         if(r[i] == 'A'){        errors.dA++;    errors.sequence += 'd'; }
435                                         if(r[i] == 'T'){        errors.dT++;    errors.sequence += 'd'; }
436                                         if(r[i] == 'G'){        errors.dG++;    errors.sequence += 'd'; }
437                                         if(r[i] == 'C'){        errors.dC++;    errors.sequence += 'd'; }
438                                 }
439                                 errors.total++; 
440                                 
441                         }
442                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
443                                 if(started == 1){       break;  }
444                         }
445                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
446                                 m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ".  Ignoring the extra bases in the query\n");
447                                 if(started == 1){       break;  }
448                         }
449                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
450                                 if(started == 1){       break;  }                       
451                         }
452                         
453                 }
454                 errors.mismatches = errors.total-errors.matches;
455                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
456                 errors.queryName = query.getName();
457                 errors.refName = reference.getName();
458                 
459                 return errors;
460         }
461         catch(exception& e) {
462                 m->errorOut(e, "SeqErrorCommand", "getErrors");
463                 exit(1);
464         }
465 }
466
467 //***************************************************************************************************************
468
469 map<string, int> SeqErrorCommand::getWeights(){
470         ifstream nameFile;
471         m->openInputFile(namesFileName, nameFile);
472         
473         string seqName;
474         string redundantSeqs;
475         map<string, int> nameCountMap;
476         
477         while(nameFile){
478                 nameFile >> seqName >> redundantSeqs;
479                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
480                 m->gobble(nameFile);
481         }
482         return nameCountMap;
483 }
484
485
486 //***************************************************************************************************************
487
488 void SeqErrorCommand::printErrorHeader(){
489         try {
490                 errorSummaryFile << "query\treference\tweight\t";
491                 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
492                 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\n";
493                 
494                 errorSummaryFile << setprecision(6);
495                 errorSummaryFile.setf(ios::fixed);
496         }
497         catch(exception& e) {
498                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
499                 exit(1);
500         }
501 }
502
503 //***************************************************************************************************************
504
505 void SeqErrorCommand::printErrorData(Compare error){
506         try {
507                 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
508                 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
509                 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
510                 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
511                 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
512                 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
513                 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
514                 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
515                 
516                 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                  //insertions
517                 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t';                  //deletions
518                 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';      //substitutions
519                 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';       //ambiguities
520                 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << endl;
521                 
522                 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
523         }
524         catch(exception& e) {
525                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
526                 exit(1);
527         }
528 }
529
530 //***************************************************************************************************************
531
532
533
534
535
536
537