]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
finished adding the names option to trim.seqs
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11 #include "reportfile.h"
12 #include "qualityscores.h"
13 #include "refchimeratest.h"
14
15 //**********************************************************************************************************************
16 vector<string> SeqErrorCommand::setParameters(){        
17         try {
18                 CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pquery);
19                 CommandParameter preference("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(preference);
20                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(pqfile);
21                 CommandParameter preport("report", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(preport);
22                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
23                 CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pignorechimeras);
24                 CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pthreshold);
25                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
26                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
28                 
29                 vector<string> myArray;
30                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
31                 return myArray;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "SeqErrorCommand", "setParameters");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 string SeqErrorCommand::getHelpString(){        
40         try {
41                 string helpString = "";
42                 helpString += "The seq.error command reads a query alignment file and a reference alignment file and creates .....\n";
43                 helpString += "Example seq.error(...).\n";
44                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
45                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "SeqErrorCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 SeqErrorCommand::SeqErrorCommand(){     
55         try {
56                 abort = true; calledHelp = true; 
57                 vector<string> tempOutNames;
58                 outputTypes["error.summary"] = tempOutNames;
59                 outputTypes["error.seq"] = tempOutNames;
60                 outputTypes["error.quality"] = tempOutNames;
61                 outputTypes["error.qual.forward"] = tempOutNames;
62                 outputTypes["error.qual.reverse"] = tempOutNames;
63                 outputTypes["error.forward"] = tempOutNames;
64                 outputTypes["error.reverse"] = tempOutNames;
65                 outputTypes["error.count"] = tempOutNames;
66                 outputTypes["error.matrix"] = tempOutNames;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
70                 exit(1);
71         }
72 }
73 //***************************************************************************************************************
74
75 SeqErrorCommand::SeqErrorCommand(string option)  {
76         try {
77                 
78                 abort = false; calledHelp = false;   
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
83                 
84                 else {
85                         string temp;
86                         vector<string> myArray = setParameters();
87                         
88                         OptionParser parser(option);
89                         map<string,string> parameters = parser.getParameters();
90                         
91                         ValidParameters validParameter;
92                         map<string,string>::iterator it;
93                         
94                         //check to make sure all parameters are valid for command
95                         for (it = parameters.begin(); it != parameters.end(); it++) { 
96                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
97                         }
98                         
99                         //initialize outputTypes
100                         vector<string> tempOutNames;
101                         outputTypes["error.summary"] = tempOutNames;
102                         outputTypes["error.seq"] = tempOutNames;
103                         outputTypes["error.quality"] = tempOutNames;
104                         outputTypes["error.qual.forward"] = tempOutNames;
105                         outputTypes["error.qual.reverse"] = tempOutNames;
106                         outputTypes["error.forward"] = tempOutNames;
107                         outputTypes["error.reverse"] = tempOutNames;
108                         outputTypes["error.count"] = tempOutNames;
109                         outputTypes["error.matrix"] = tempOutNames;
110
111                         
112                         //if the user changes the input directory command factory will send this info to us in the output parameter 
113                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
114                         if (inputDir == "not found"){   inputDir = "";          }
115                         else {
116                                 string path;
117                                 it = parameters.find("fasta");
118                                 //user has given a template file
119                                 if(it != parameters.end()){ 
120                                         path = m->hasPath(it->second);
121                                         //if the user has not given a path then, add inputdir. else leave path alone.
122                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
123                                 }
124                                 
125                                 it = parameters.find("reference");
126                                 //user has given a template file
127                                 if(it != parameters.end()){ 
128                                         path = m->hasPath(it->second);
129                                         //if the user has not given a path then, add inputdir. else leave path alone.
130                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
131                                 }
132                                 
133                                 it = parameters.find("name");
134                                 //user has given a names file
135                                 if(it != parameters.end()){ 
136                                         path = m->hasPath(it->second);
137                                         //if the user has not given a path then, add inputdir. else leave path alone.
138                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
139                                 }
140
141                                 it = parameters.find("qfile");
142                                 //user has given a quality score file
143                                 if(it != parameters.end()){ 
144                                         path = m->hasPath(it->second);
145                                         //if the user has not given a path then, add inputdir. else leave path alone.
146                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
147                                 }
148                                 
149                                 it = parameters.find("report");
150                                 //user has given a alignment report file
151                                 if(it != parameters.end()){ 
152                                         path = m->hasPath(it->second);
153                                         //if the user has not given a path then, add inputdir. else leave path alone.
154                                         if (path == "") {       parameters["report"] = inputDir + it->second;           }
155                                 }
156                                 
157                         }
158                         //check for required parameters
159                         queryFileName = validParameter.validFile(parameters, "fasta", true);
160                         if (queryFileName == "not found") { 
161                                 queryFileName = m->getFastaFile(); 
162                                 if (queryFileName != "") { m->mothurOut("Using " + queryFileName + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
163                                 else {  m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
164                         }
165                         else if (queryFileName == "not open") { abort = true; } 
166                         
167                         referenceFileName = validParameter.validFile(parameters, "reference", true);
168                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
169                         else if (referenceFileName == "not open") { abort = true; }     
170                         
171
172                         //check for optional parameters
173                         namesFileName = validParameter.validFile(parameters, "name", true);
174                         if(namesFileName == "not found"){       namesFileName = "";     }
175                         else if (namesFileName == "not open") { namesFileName = ""; abort = true; }     
176                         
177                         qualFileName = validParameter.validFile(parameters, "qfile", true);
178                         if(qualFileName == "not found"){        qualFileName = "";      }
179                         else if (qualFileName == "not open") { qualFileName = ""; abort = true; }       
180
181                         reportFileName = validParameter.validFile(parameters, "report", true);
182                         if(reportFileName == "not found"){      reportFileName = "";    }
183                         else if (reportFileName == "not open") { reportFileName = ""; abort = true; }   
184                         
185                         if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
186                                 m->mothurOut("if you use either a qual file or a report file, you have to have both.");
187                                 m->mothurOutEndLine();
188                                 abort = true; 
189                         }
190                         
191                         outputDir = validParameter.validFile(parameters, "outputdir", false);
192                         if (outputDir == "not found"){  
193                                 outputDir = ""; 
194                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
195                         }
196                         
197                         //check for optional parameter and set defaults
198                         // ...at some point should added some additional type checking...
199                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
200                         convert(temp, threshold);  
201                         
202                         temp = validParameter.validFile(parameters, "ignorechimeras", false);   if (temp == "not found") { temp = "1"; }
203                         convert(temp, ignoreChimeras);  
204                         
205                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
206                         m->setProcessors(temp);
207                         convert(temp, processors); 
208
209                         substitutionMatrix.resize(6);
210                         for(int i=0;i<6;i++){   substitutionMatrix[i].resize(6,0);      }
211                 }
212         }
213         catch(exception& e) {
214                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
215                 exit(1);
216         }
217 }
218 //***************************************************************************************************************
219
220 int SeqErrorCommand::execute(){
221         try{
222                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
223
224                 maxLength = 2000;
225                 
226                 string errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
227                 m->openOutputFile(errorSummaryFileName, errorSummaryFile);
228                 outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
229                 printErrorHeader();
230                 
231                 string errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq";
232                 m->openOutputFile(errorSeqFileName, errorSeqFile);
233                 outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName);
234
235                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
236
237                 map<string, int> weights;
238                 if(namesFileName != ""){        weights = getWeights(); }
239                 
240                 ifstream queryFile;
241                 m->openInputFile(queryFileName, queryFile);
242                 
243                 ifstream reportFile;
244                 ifstream qualFile;
245
246                 ReportFile report;
247                 QualityScores quality;
248                 vector<vector<int> > qualForwardMap;
249                 vector<vector<int> > qualReverseMap;
250                 
251                 if(qualFileName != "" && reportFileName != ""){
252                         m->openInputFile(qualFileName, qualFile);
253                         report = ReportFile(reportFile, reportFileName);
254                         
255                         qualForwardMap.resize(maxLength);
256                         qualReverseMap.resize(maxLength);
257                         for(int i=0;i<maxLength;i++){
258                                 qualForwardMap[i].assign(41,0);
259                                 qualReverseMap[i].assign(41,0);
260                         }                               
261                 }
262                 
263                 int totalBases = 0;
264                 int totalMatches = 0;
265                 
266                 vector<int> misMatchCounts(11, 0);
267                 int maxMismatch = 0;
268                 int numSeqs = 0;
269                 
270                 map<string, int>::iterator it;
271                 map<char, vector<int> > qScoreErrorMap;
272                 qScoreErrorMap['m'].assign(41, 0);
273                 qScoreErrorMap['s'].assign(41, 0);
274                 qScoreErrorMap['i'].assign(41, 0);
275                 qScoreErrorMap['a'].assign(41, 0);
276                 
277                 map<char, vector<int> > errorForward;
278                 errorForward['m'].assign(maxLength,0);
279                 errorForward['s'].assign(maxLength,0);
280                 errorForward['i'].assign(maxLength,0);
281                 errorForward['d'].assign(maxLength,0);
282                 errorForward['a'].assign(maxLength,0);
283                 
284                 map<char, vector<int> > errorReverse;
285                 errorReverse['m'].assign(maxLength,0);
286                 errorReverse['s'].assign(maxLength,0);
287                 errorReverse['i'].assign(maxLength,0);
288                 errorReverse['d'].assign(maxLength,0);
289                 errorReverse['a'].assign(maxLength,0);  
290                 
291
292                 string errorChimeraFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.chimera";
293                 RefChimeraTest chimeraTest(referenceSeqs, errorChimeraFileName);
294                 outputNames.push_back(errorChimeraFileName); outputTypes["error.chimera"].push_back(errorChimeraFileName);
295                 
296                 vector<string> megaAlignVector(numRefs, "");
297
298                 int index = 0;
299                 bool ignoreSeq = 0;
300                 
301                 while(queryFile){
302
303                         if (m->control_pressed) { errorSummaryFile.close();     errorSeqFile.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
304                 
305                         Sequence query(queryFile);
306                         
307                         int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned());
308                         int closestRefIndex = chimeraTest.getClosestRefIndex();
309
310                         if(numParentSeqs > 1 && ignoreChimeras == 1)    {       ignoreSeq = 1;  }
311                         else                                                                                    {       ignoreSeq = 0;  }
312
313                         Compare minCompare = getErrors(query, referenceSeqs[closestRefIndex]);
314                         
315                         if(namesFileName != ""){
316                                 it = weights.find(query.getName());
317                                 minCompare.weight = it->second;
318                         }
319                         else{   minCompare.weight = 1;  }
320
321                         printErrorData(minCompare, numParentSeqs);
322                 
323                         if(!ignoreSeq){
324
325                                 for(int i=0;i<minCompare.sequence.length();i++){
326                                         char letter = minCompare.sequence[i];
327
328                                         errorForward[letter][i] += minCompare.weight;
329                                         errorReverse[letter][minCompare.total-i-1] += minCompare.weight;                                
330                                 }
331                         }
332
333                         if(qualFileName != "" && reportFileName != ""){
334                                 report = ReportFile(reportFile);
335                                 
336 //                              int origLength = report.getQueryLength();
337                                 int startBase = report.getQueryStart();
338                                 int endBase = report.getQueryEnd();
339
340                                 quality = QualityScores(qualFile);
341
342                                 if(!ignoreSeq){
343                                         quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
344                                         quality.updateForwardMap(qualForwardMap, startBase, endBase, minCompare.weight);
345                                         quality.updateReverseMap(qualReverseMap, startBase, endBase, minCompare.weight);
346                                 }
347                         }                       
348
349                         if(minCompare.errorRate < threshold && !ignoreSeq){
350                                 totalBases += (minCompare.total * minCompare.weight);
351                                 totalMatches += minCompare.matches * minCompare.weight;
352                                 if(minCompare.mismatches > maxMismatch){
353                                         maxMismatch = minCompare.mismatches;
354                                         misMatchCounts.resize(maxMismatch + 1, 0);
355                                 }                               
356                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
357                                 numSeqs++;
358                                 
359                                 megaAlignVector[closestRefIndex] += query.getInlineSeq() + '\n';
360                         }
361
362                         index++;
363                         
364                         if(index % 1000 == 0){  m->mothurOut(toString(index) + '\n');   }
365                 }
366                 queryFile.close();
367                 errorSummaryFile.close();       
368                 errorSeqFile.close();
369
370                 if(qualFileName != "" && reportFileName != ""){         
371                         printErrorQuality(qScoreErrorMap);
372                         printQualityFR(qualForwardMap, qualReverseMap);
373                 }
374                 
375                 printErrorFRFile(errorForward, errorReverse);
376                 
377                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
378
379                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count";
380                 ofstream errorCountFile;
381                 m->openOutputFile(errorCountFileName, errorCountFile);
382                 outputNames.push_back(errorCountFileName);  outputTypes["error.count"].push_back(errorCountFileName);
383                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n");
384                 m->mothurOut("Errors\tSequences\n");
385                 errorCountFile << "Errors\tSequences\n";                
386                 for(int i=0;i<misMatchCounts.size();i++){
387                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
388                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
389                 }
390                 errorCountFile.close();
391                 
392                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
393
394                 printSubMatrix();
395                                 
396                 string megAlignmentFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.ref-query";
397                 ofstream megAlignmentFile;
398                 m->openOutputFile(megAlignmentFileName, megAlignmentFile);
399                 outputNames.push_back(megAlignmentFileName);  outputTypes["error.ref-query"].push_back(megAlignmentFileName);
400                 
401                 for(int i=0;i<numRefs;i++){
402                         megAlignmentFile << referenceSeqs[i].getInlineSeq() << endl;
403                         megAlignmentFile << megaAlignVector[i] << endl;
404                 }
405                 
406                 
407                 m->mothurOutEndLine();
408                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
409                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
410                 m->mothurOutEndLine();
411                 
412                 return 0;       
413         }
414         catch(exception& e) {
415                 m->errorOut(e, "SeqErrorCommand", "execute");
416                 exit(1);
417         }
418 }
419
420 //***************************************************************************************************************
421
422 void SeqErrorCommand::getReferences(){
423         try {
424                 
425                 ifstream referenceFile;
426                 m->openInputFile(referenceFileName, referenceFile);
427                 
428                 int numAmbigSeqs = 0;
429                 
430                 int maxStartPos = 0;
431                 int minEndPos = 100000;
432                 
433                 while(referenceFile){
434                         Sequence currentSeq(referenceFile);
435                         int numAmbigs = currentSeq.getAmbigBases();
436                         if(numAmbigs > 0){      numAmbigSeqs++; }
437                         
438 //                      int startPos = currentSeq.getStartPos();
439 //                      if(startPos > maxStartPos)      {       maxStartPos = startPos; }
440 //
441 //                      int endPos = currentSeq.getEndPos();
442 //                      if(endPos < minEndPos)          {       minEndPos = endPos;             }
443                         referenceSeqs.push_back(currentSeq);
444                                 
445                         m->gobble(referenceFile);
446                 }
447                 referenceFile.close();
448                 numRefs = referenceSeqs.size();
449
450                 
451                 for(int i=0;i<numRefs;i++){
452                         referenceSeqs[i].padToPos(maxStartPos);
453                         referenceSeqs[i].padFromPos(minEndPos);
454                 }
455                 
456                 if(numAmbigSeqs != 0){
457                         m->mothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n");
458                 }               
459                 
460         }
461         catch(exception& e) {
462                 m->errorOut(e, "SeqErrorCommand", "getReferences");
463                 exit(1);
464         }
465 }
466
467 //***************************************************************************************************************
468
469 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
470         try {
471                 if(query.getAlignLength() != reference.getAlignLength()){
472                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
473                 }
474                 int alignLength = query.getAlignLength();
475         
476                 string q = query.getAligned();
477                 string r = reference.getAligned();
478
479                 int started = 0;
480                 Compare errors;
481
482                 for(int i=0;i<alignLength;i++){
483                         if(r[i] != 'N' && q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                  //      no missing data and no double gaps
484                                 started = 1;
485                                 
486                                 if(q[i] == 'A'){
487                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       errors.sequence += 'm'; }
488                                         if(r[i] == 'T'){        errors.AT++;    errors.sequence += 's'; }
489                                         if(r[i] == 'G'){        errors.AG++;    errors.sequence += 's'; }
490                                         if(r[i] == 'C'){        errors.AC++;    errors.sequence += 's'; }
491                                         if(r[i] == '-'){        errors.Ai++;    errors.sequence += 'i'; }
492                                 }
493                                 else if(q[i] == 'T'){
494                                         if(r[i] == 'A'){        errors.TA++;    errors.sequence += 's'; }
495                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       errors.sequence += 'm'; }
496                                         if(r[i] == 'G'){        errors.TG++;    errors.sequence += 's'; }
497                                         if(r[i] == 'C'){        errors.TC++;    errors.sequence += 's'; }
498                                         if(r[i] == '-'){        errors.Ti++;    errors.sequence += 'i'; }
499                                 }
500                                 else if(q[i] == 'G'){
501                                         if(r[i] == 'A'){        errors.GA++;    errors.sequence += 's'; }
502                                         if(r[i] == 'T'){        errors.GT++;    errors.sequence += 's'; }
503                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       errors.sequence += 'm'; }
504                                         if(r[i] == 'C'){        errors.GC++;    errors.sequence += 's'; }
505                                         if(r[i] == '-'){        errors.Gi++;    errors.sequence += 'i'; }
506                                 }
507                                 else if(q[i] == 'C'){
508                                         if(r[i] == 'A'){        errors.CA++;    errors.sequence += 's'; }
509                                         if(r[i] == 'T'){        errors.CT++;    errors.sequence += 's'; }
510                                         if(r[i] == 'G'){        errors.CG++;    errors.sequence += 's'; }
511                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       errors.sequence += 'm'; }
512                                         if(r[i] == '-'){        errors.Ci++;    errors.sequence += 'i'; }
513                                 }
514                                 else if(q[i] == 'N'){
515                                         if(r[i] == 'A'){        errors.NA++;    errors.sequence += 'a'; }
516                                         if(r[i] == 'T'){        errors.NT++;    errors.sequence += 'a'; }
517                                         if(r[i] == 'G'){        errors.NG++;    errors.sequence += 'a'; }
518                                         if(r[i] == 'C'){        errors.NC++;    errors.sequence += 'a'; }
519                                         if(r[i] == '-'){        errors.Ni++;    errors.sequence += 'a'; }
520                                 }
521                                 else if(q[i] == '-' && r[i] != '-'){
522                                         if(r[i] == 'A'){        errors.dA++;    errors.sequence += 'd'; }
523                                         if(r[i] == 'T'){        errors.dT++;    errors.sequence += 'd'; }
524                                         if(r[i] == 'G'){        errors.dG++;    errors.sequence += 'd'; }
525                                         if(r[i] == 'C'){        errors.dC++;    errors.sequence += 'd'; }
526                                 }
527                                 errors.total++; 
528                                 
529                         }
530                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
531                                 if(started == 1){       break;  }
532                         }
533                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
534                                 if(started == 1){       break;  }
535                         }
536                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
537                                 if(started == 1){       break;  }                       
538                         }
539                         
540                 }
541                 errors.mismatches = errors.total-errors.matches;
542                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
543                 errors.queryName = query.getName();
544                 errors.refName = reference.getName();
545                 
546                 return errors;
547         }
548         catch(exception& e) {
549                 m->errorOut(e, "SeqErrorCommand", "getErrors");
550                 exit(1);
551         }
552 }
553
554 //***************************************************************************************************************
555
556 map<string, int> SeqErrorCommand::getWeights(){
557         ifstream nameFile;
558         m->openInputFile(namesFileName, nameFile);
559         
560         string seqName;
561         string redundantSeqs;
562         map<string, int> nameCountMap;
563         
564         while(nameFile){
565                 nameFile >> seqName >> redundantSeqs;
566                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
567                 m->gobble(nameFile);
568         }
569         return nameCountMap;
570 }
571
572
573 //***************************************************************************************************************
574
575 void SeqErrorCommand::printErrorHeader(){
576         try {
577                 errorSummaryFile << "query\treference\tweight\t";
578                 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
579                 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\tnumparents\n";
580                 
581                 errorSummaryFile << setprecision(6);
582                 errorSummaryFile.setf(ios::fixed);
583         }
584         catch(exception& e) {
585                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
586                 exit(1);
587         }
588 }
589
590 //***************************************************************************************************************
591
592 void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs){
593         try {
594
595                 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
596                 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
597                 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
598                 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
599                 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
600                 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
601                 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t';
602                 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
603                 
604                 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                  //insertions
605                 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t';                  //deletions
606                 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';      //substitutions
607                 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';       //ambiguities
608                 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << '\t' << numParentSeqs << endl;
609
610                 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
611                 
612                 int a=0;                int t=1;                int g=2;                int c=3;
613                 int gap=4;              int n=5;
614
615                 if(numParentSeqs == 1 || ignoreChimeras == 0){
616                         substitutionMatrix[a][a] += error.weight * error.AA;
617                         substitutionMatrix[a][t] += error.weight * error.TA;
618                         substitutionMatrix[a][g] += error.weight * error.GA;
619                         substitutionMatrix[a][c] += error.weight * error.CA;
620                         substitutionMatrix[a][gap] += error.weight * error.dA;
621                         substitutionMatrix[a][n] += error.weight * error.NA;
622                         
623                         substitutionMatrix[t][a] += error.weight * error.AT;
624                         substitutionMatrix[t][t] += error.weight * error.TT;
625                         substitutionMatrix[t][g] += error.weight * error.GT;
626                         substitutionMatrix[t][c] += error.weight * error.CT;
627                         substitutionMatrix[t][gap] += error.weight * error.dT;
628                         substitutionMatrix[t][n] += error.weight * error.NT;
629
630                         substitutionMatrix[g][a] += error.weight * error.AG;
631                         substitutionMatrix[g][t] += error.weight * error.TG;
632                         substitutionMatrix[g][g] += error.weight * error.GG;
633                         substitutionMatrix[g][c] += error.weight * error.CG;
634                         substitutionMatrix[g][gap] += error.weight * error.dG;
635                         substitutionMatrix[g][n] += error.weight * error.NG;
636
637                         substitutionMatrix[c][a] += error.weight * error.AC;
638                         substitutionMatrix[c][t] += error.weight * error.TC;
639                         substitutionMatrix[c][g] += error.weight * error.GC;
640                         substitutionMatrix[c][c] += error.weight * error.CC;
641                         substitutionMatrix[c][gap] += error.weight * error.dC;
642                         substitutionMatrix[c][n] += error.weight * error.NC;
643
644                         substitutionMatrix[gap][a] += error.weight * error.Ai;
645                         substitutionMatrix[gap][t] += error.weight * error.Ti;
646                         substitutionMatrix[gap][g] += error.weight * error.Gi;
647                         substitutionMatrix[gap][c] += error.weight * error.Ci;
648                         substitutionMatrix[gap][n] += error.weight * error.Ni;
649                 }
650         }
651         catch(exception& e) {
652                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
653                 exit(1);
654         }
655 }
656
657 //***************************************************************************************************************
658
659 void SeqErrorCommand::printSubMatrix(){
660         try {
661                 string subMatrixFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.matrix";
662                 ofstream subMatrixFile;
663                 m->openOutputFile(subMatrixFileName, subMatrixFile);
664                 outputNames.push_back(subMatrixFileName);  outputTypes["error.matrix"].push_back(subMatrixFileName);
665                 vector<string> bases(6);
666                 bases[0] = "A";
667                 bases[1] = "T";
668                 bases[2] = "G";
669                 bases[3] = "C";
670                 bases[4] = "Gap";
671                 bases[5] = "N";
672                 vector<int> refSums(5,1);
673
674                 for(int i=0;i<5;i++){
675                         subMatrixFile << "\tr" << bases[i];
676                         
677                         for(int j=0;j<6;j++){
678                                 refSums[i] += substitutionMatrix[i][j];                         
679                         }
680                 }
681                 subMatrixFile << endl;
682                 
683                 for(int i=0;i<6;i++){
684                         subMatrixFile << 'q' << bases[i];
685                         for(int j=0;j<5;j++){
686                                 subMatrixFile << '\t' << substitutionMatrix[j][i];                              
687                         }
688                         subMatrixFile << endl;
689                 }
690
691                 subMatrixFile << "total";
692                 for(int i=0;i<5;i++){
693                         subMatrixFile << '\t' << refSums[i];
694                 }
695                 subMatrixFile << endl;
696                 subMatrixFile.close();
697         }
698         catch(exception& e) {
699                 m->errorOut(e, "SeqErrorCommand", "printSubMatrix");
700                 exit(1);
701         }
702 }
703 //***************************************************************************************************************
704
705 void SeqErrorCommand::printErrorFRFile(map<char, vector<int> > errorForward, map<char, vector<int> > errorReverse){
706         try{
707                 string errorForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.forward";
708                 ofstream errorForwardFile;
709                 m->openOutputFile(errorForwardFileName, errorForwardFile);
710                 outputNames.push_back(errorForwardFileName);  outputTypes["error.forward"].push_back(errorForwardFileName);
711
712                 errorForwardFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
713                 for(int i=0;i<maxLength;i++){
714                         float match = (float)errorForward['m'][i];
715                         float subst = (float)errorForward['s'][i];
716                         float insert = (float)errorForward['i'][i];
717                         float del = (float)errorForward['d'][i];
718                         float amb = (float)errorForward['a'][i];
719                         float total = match + subst + insert + del + amb;
720                         if(total == 0){ break;  }
721                         errorForwardFile << i+1 << '\t' << total << '\t' << match/total  << '\t' << subst/total  << '\t' << insert/total  << '\t' << del/total  << '\t' << amb/total << endl;
722                 }
723                 errorForwardFile.close();
724
725                 string errorReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.reverse";
726                 ofstream errorReverseFile;
727                 m->openOutputFile(errorReverseFileName, errorReverseFile);
728                 outputNames.push_back(errorReverseFileName);  outputTypes["error.reverse"].push_back(errorReverseFileName);
729
730                 errorReverseFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
731                 for(int i=0;i<maxLength;i++){
732                         float match = (float)errorReverse['m'][i];
733                         float subst = (float)errorReverse['s'][i];
734                         float insert = (float)errorReverse['i'][i];
735                         float del = (float)errorReverse['d'][i];
736                         float amb = (float)errorReverse['a'][i];
737                         float total = match + subst + insert + del + amb;
738                         if(total == 0){ break;  }
739                         errorReverseFile << i+1 << '\t' << total << '\t' << match/total  << '\t' << subst/total  << '\t' << insert/total  << '\t' << del/total  << '\t' << amb/total << endl;
740                 }
741                 errorReverseFile.close();
742         }
743         catch(exception& e) {
744                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
745                 exit(1);
746         }
747 }
748
749 //***************************************************************************************************************
750
751 void SeqErrorCommand::printErrorQuality(map<char, vector<int> > qScoreErrorMap){
752         try{
753
754                 string errorQualityFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.quality";
755                 ofstream errorQualityFile;
756                 m->openOutputFile(errorQualityFileName, errorQualityFile);
757                 outputNames.push_back(errorQualityFileName);  outputTypes["error.quality"].push_back(errorQualityFileName);
758
759                 errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl;
760                 for(int i=0;i<41;i++){
761                         errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl;
762                 }
763                 errorQualityFile.close();
764         }
765         catch(exception& e) {
766                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
767                 exit(1);
768         }
769 }
770
771
772 //***************************************************************************************************************
773
774 void SeqErrorCommand::printQualityFR(vector<vector<int> > qualForwardMap, vector<vector<int> > qualReverseMap){
775
776         try{
777                 int numRows = 0;
778                 int numColumns = qualForwardMap[0].size();
779
780                 for(int i=0;i<qualForwardMap.size();i++){
781                         for(int j=0;j<numColumns;j++){
782                                 if(qualForwardMap[i][j] != 0){
783                                         if(numRows < i)         {       numRows = i+20;         }
784                                 }
785                         }
786                 }
787
788                 string qualityForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.forward";
789                 ofstream qualityForwardFile;
790                 m->openOutputFile(qualityForwardFileName, qualityForwardFile);
791                 outputNames.push_back(qualityForwardFileName);  outputTypes["error.qual.forward"].push_back(qualityForwardFileName);
792
793                 for(int i=0;i<numColumns;i++){  qualityForwardFile << '\t' << i;        }       qualityForwardFile << endl;
794
795                 for(int i=0;i<numRows;i++){
796                         qualityForwardFile << i+1;
797                         for(int j=0;j<numColumns;j++){
798                                 qualityForwardFile << '\t' << qualForwardMap[i][j];
799                         }
800
801                         qualityForwardFile << endl;
802                 }
803                 qualityForwardFile.close();
804
805                 
806                 string qualityReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.reverse";
807                 ofstream qualityReverseFile;
808                 m->openOutputFile(qualityReverseFileName, qualityReverseFile);
809                 outputNames.push_back(qualityReverseFileName);  outputTypes["error.qual.reverse"].push_back(qualityReverseFileName);
810                 
811                 for(int i=0;i<numColumns;i++){  qualityReverseFile << '\t' << i;        }       qualityReverseFile << endl;
812                 for(int i=0;i<numRows;i++){
813                         
814                         qualityReverseFile << i+1;
815                         for(int j=0;j<numColumns;j++){
816                                 qualityReverseFile << '\t' << qualReverseMap[i][j];
817                         }
818                         qualityReverseFile << endl;
819                 }
820                 qualityReverseFile.close();
821         }
822         catch(exception& e) {
823                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
824                 exit(1);
825         }
826         
827 }
828
829 //***************************************************************************************************************