]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
added getCommandInfoCommand for gui
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11 #include "reportfile.h"
12 #include "qualityscores.h"
13 #include "refchimeratest.h"
14
15 //**********************************************************************************************************************
16 vector<string> SeqErrorCommand::setParameters(){        
17         try {
18                 CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pquery);
19                 CommandParameter preference("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(preference);
20                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(pqfile);
21                 CommandParameter preport("report", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(preport);
22                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
23                 CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pignorechimeras);
24                 CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pthreshold);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "SeqErrorCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string SeqErrorCommand::getHelpString(){        
39         try {
40                 string helpString = "";
41                 helpString += "The seq.error command reads a query alignment file and a reference alignment file and creates .....\n";
42                 helpString += "Example seq.error(...).\n";
43                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
44                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n";
45                 return helpString;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "SeqErrorCommand", "getHelpString");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 SeqErrorCommand::SeqErrorCommand(){     
54         try {
55                 abort = true; calledHelp = true; 
56                 vector<string> tempOutNames;
57                 outputTypes["error.summary"] = tempOutNames;
58                 outputTypes["error.seq"] = tempOutNames;
59                 outputTypes["error.quality"] = tempOutNames;
60                 outputTypes["error.qual.forward"] = tempOutNames;
61                 outputTypes["error.qual.reverse"] = tempOutNames;
62                 outputTypes["error.forward"] = tempOutNames;
63                 outputTypes["error.reverse"] = tempOutNames;
64                 outputTypes["error.count"] = tempOutNames;
65                 outputTypes["error.matrix"] = tempOutNames;
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
69                 exit(1);
70         }
71 }
72 //***************************************************************************************************************
73
74 SeqErrorCommand::SeqErrorCommand(string option)  {
75         try {
76                 
77                 abort = false; calledHelp = false;   
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 
82                 else {
83                         string temp;
84                         vector<string> myArray = setParameters();
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter;
90                         map<string,string>::iterator it;
91                         
92                         //check to make sure all parameters are valid for command
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99                         outputTypes["error.summary"] = tempOutNames;
100                         outputTypes["error.seq"] = tempOutNames;
101                         outputTypes["error.quality"] = tempOutNames;
102                         outputTypes["error.qual.forward"] = tempOutNames;
103                         outputTypes["error.qual.reverse"] = tempOutNames;
104                         outputTypes["error.forward"] = tempOutNames;
105                         outputTypes["error.reverse"] = tempOutNames;
106                         outputTypes["error.count"] = tempOutNames;
107                         outputTypes["error.matrix"] = tempOutNames;
108
109                         
110                         //if the user changes the input directory command factory will send this info to us in the output parameter 
111                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
112                         if (inputDir == "not found"){   inputDir = "";          }
113                         else {
114                                 string path;
115                                 it = parameters.find("fasta");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
121                                 }
122                                 
123                                 it = parameters.find("reference");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
129                                 }
130                                 
131                                 it = parameters.find("name");
132                                 //user has given a names file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
137                                 }
138
139                                 it = parameters.find("qfile");
140                                 //user has given a quality score file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
145                                 }
146                                 
147                                 it = parameters.find("report");
148                                 //user has given a alignment report file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["report"] = inputDir + it->second;           }
153                                 }
154                                 
155                         }
156                         //check for required parameters
157                         queryFileName = validParameter.validFile(parameters, "fasta", true);
158                         if (queryFileName == "not found") { 
159                                 queryFileName = m->getFastaFile(); 
160                                 if (queryFileName != "") { m->mothurOut("Using " + queryFileName + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
161                                 else {  m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
162                         }
163                         else if (queryFileName == "not open") { abort = true; } 
164                         
165                         referenceFileName = validParameter.validFile(parameters, "reference", true);
166                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
167                         else if (referenceFileName == "not open") { abort = true; }     
168                         
169
170                         //check for optional parameters
171                         namesFileName = validParameter.validFile(parameters, "name", true);
172                         if(namesFileName == "not found"){       namesFileName = "";     }
173                         else if (namesFileName == "not open") { namesFileName = ""; abort = true; }     
174                         
175                         qualFileName = validParameter.validFile(parameters, "qfile", true);
176                         if(qualFileName == "not found"){        qualFileName = "";      }
177                         else if (qualFileName == "not open") { qualFileName = ""; abort = true; }       
178
179                         reportFileName = validParameter.validFile(parameters, "report", true);
180                         if(reportFileName == "not found"){      reportFileName = "";    }
181                         else if (reportFileName == "not open") { reportFileName = ""; abort = true; }   
182                         
183                         if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
184                                 m->mothurOut("if you use either a qual file or a report file, you have to have both.");
185                                 m->mothurOutEndLine();
186                                 abort = true; 
187                         }
188                         
189                         outputDir = validParameter.validFile(parameters, "outputdir", false);
190                         if (outputDir == "not found"){  
191                                 outputDir = ""; 
192                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
193                         }
194                         
195                         //check for optional parameter and set defaults
196                         // ...at some point should added some additional type checking...
197                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
198                         convert(temp, threshold);  
199                         
200                         temp = validParameter.validFile(parameters, "ignorechimeras", false);   if (temp == "not found") { temp = "1"; }
201                         convert(temp, ignoreChimeras);  
202
203                         substitutionMatrix.resize(6);
204                         for(int i=0;i<6;i++){   substitutionMatrix[i].resize(6,0);      }
205                 }
206         }
207         catch(exception& e) {
208                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
209                 exit(1);
210         }
211 }
212 //***************************************************************************************************************
213
214 int SeqErrorCommand::execute(){
215         try{
216                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
217
218                 maxLength = 2000;
219                 
220                 string errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
221                 m->openOutputFile(errorSummaryFileName, errorSummaryFile);
222                 outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
223                 printErrorHeader();
224                 
225                 string errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq";
226                 m->openOutputFile(errorSeqFileName, errorSeqFile);
227                 outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName);
228
229                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
230
231                 map<string, int> weights;
232                 if(namesFileName != ""){        weights = getWeights(); }
233                 
234                 ifstream queryFile;
235                 m->openInputFile(queryFileName, queryFile);
236                 
237                 ifstream reportFile;
238                 ifstream qualFile;
239
240                 ReportFile report;
241                 QualityScores quality;
242                 vector<vector<int> > qualForwardMap;
243                 vector<vector<int> > qualReverseMap;
244                 
245                 if(qualFileName != "" && reportFileName != ""){
246                         m->openInputFile(qualFileName, qualFile);
247                         report = ReportFile(reportFile, reportFileName);
248                         
249                         qualForwardMap.resize(maxLength);
250                         qualReverseMap.resize(maxLength);
251                         for(int i=0;i<maxLength;i++){
252                                 qualForwardMap[i].assign(41,0);
253                                 qualReverseMap[i].assign(41,0);
254                         }                               
255                 }
256                 
257                 int totalBases = 0;
258                 int totalMatches = 0;
259                 
260                 vector<int> misMatchCounts(11, 0);
261                 int maxMismatch = 0;
262                 int numSeqs = 0;
263                 
264                 map<string, int>::iterator it;
265                 map<char, vector<int> > qScoreErrorMap;
266                 qScoreErrorMap['m'].assign(41, 0);
267                 qScoreErrorMap['s'].assign(41, 0);
268                 qScoreErrorMap['i'].assign(41, 0);
269                 qScoreErrorMap['a'].assign(41, 0);
270                 
271                 map<char, vector<int> > errorForward;
272                 errorForward['m'].assign(maxLength,0);
273                 errorForward['s'].assign(maxLength,0);
274                 errorForward['i'].assign(maxLength,0);
275                 errorForward['d'].assign(maxLength,0);
276                 errorForward['a'].assign(maxLength,0);
277                 
278                 map<char, vector<int> > errorReverse;
279                 errorReverse['m'].assign(maxLength,0);
280                 errorReverse['s'].assign(maxLength,0);
281                 errorReverse['i'].assign(maxLength,0);
282                 errorReverse['d'].assign(maxLength,0);
283                 errorReverse['a'].assign(maxLength,0);  
284                 
285
286                 string errorChimeraFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.chimera";
287                 RefChimeraTest chimeraTest(referenceSeqs, errorChimeraFileName);
288                 outputNames.push_back(errorChimeraFileName); outputTypes["error.chimera"].push_back(errorChimeraFileName);
289                 
290                 vector<string> megaAlignVector(numRefs, "");
291
292                 int index = 0;
293                 bool ignoreSeq = 0;
294                 
295                 while(queryFile){
296
297                         if (m->control_pressed) { errorSummaryFile.close();     errorSeqFile.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
298                 
299                         Sequence query(queryFile);
300                         
301                         int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned());
302                         int closestRefIndex = chimeraTest.getClosestRefIndex();
303
304                         if(numParentSeqs > 1 && ignoreChimeras == 1)    {       ignoreSeq = 1;  }
305                         else                                                                                    {       ignoreSeq = 0;  }
306
307                         Compare minCompare = getErrors(query, referenceSeqs[closestRefIndex]);
308                         
309                         if(namesFileName != ""){
310                                 it = weights.find(query.getName());
311                                 minCompare.weight = it->second;
312                         }
313                         else{   minCompare.weight = 1;  }
314
315                         printErrorData(minCompare, numParentSeqs);
316
317                         if(!ignoreSeq){
318                                 
319                                 for(int i=0;i<minCompare.total;i++){
320                                         char letter = minCompare.sequence[i];
321
322                                         errorForward[letter][i] += minCompare.weight;
323                                         errorReverse[letter][minCompare.total-i-1] += minCompare.weight;                                
324                                 }
325                         }
326
327                         if(qualFileName != "" && reportFileName != ""){
328                                 report = ReportFile(reportFile);
329                                 
330 //                              int origLength = report.getQueryLength();
331                                 int startBase = report.getQueryStart();
332                                 int endBase = report.getQueryEnd();
333
334                                 quality = QualityScores(qualFile);
335
336                                 if(!ignoreSeq){
337                                         quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
338                                         quality.updateForwardMap(qualForwardMap, startBase, endBase, minCompare.weight);
339                                         quality.updateReverseMap(qualReverseMap, startBase, endBase, minCompare.weight);
340                                 }
341                         }                       
342
343                         if(minCompare.errorRate < threshold && !ignoreSeq){
344                                 totalBases += (minCompare.total * minCompare.weight);
345                                 totalMatches += minCompare.matches * minCompare.weight;
346                                 if(minCompare.mismatches > maxMismatch){
347                                         maxMismatch = minCompare.mismatches;
348                                         misMatchCounts.resize(maxMismatch + 1, 0);
349                                 }                               
350                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
351                                 numSeqs++;
352                                 
353                                 megaAlignVector[closestRefIndex] += query.getInlineSeq() + '\n';
354                         }
355
356                         index++;
357                         
358                         if(index % 1000 == 0){  m->mothurOut(toString(index) + '\n');   }
359                 }
360                 queryFile.close();
361                 errorSummaryFile.close();       
362                 errorSeqFile.close();
363
364                 if(qualFileName != "" && reportFileName != ""){         
365                         printErrorQuality(qScoreErrorMap);
366                         printQualityFR(qualForwardMap, qualReverseMap);
367                 }
368                 
369                 printErrorFRFile(errorForward, errorReverse);
370                 
371                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
372
373                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count";
374                 ofstream errorCountFile;
375                 m->openOutputFile(errorCountFileName, errorCountFile);
376                 outputNames.push_back(errorCountFileName);  outputTypes["error.count"].push_back(errorCountFileName);
377                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n");
378                 m->mothurOut("Errors\tSequences\n");
379                 errorCountFile << "Errors\tSequences\n";                
380                 for(int i=0;i<misMatchCounts.size();i++){
381                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
382                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
383                 }
384                 errorCountFile.close();
385                 
386                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
387
388                 printSubMatrix();
389                                 
390                 string megAlignmentFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.ref-query";
391                 ofstream megAlignmentFile;
392                 m->openOutputFile(megAlignmentFileName, megAlignmentFile);
393                 outputNames.push_back(megAlignmentFileName);  outputTypes["error.ref-query"].push_back(megAlignmentFileName);
394                 
395                 for(int i=0;i<numRefs;i++){
396                         megAlignmentFile << referenceSeqs[i].getInlineSeq() << endl;
397                         megAlignmentFile << megaAlignVector[i] << endl;
398                 }
399                 
400                 
401                 m->mothurOutEndLine();
402                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
403                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
404                 m->mothurOutEndLine();
405                 
406                 return 0;       
407         }
408         catch(exception& e) {
409                 m->errorOut(e, "SeqErrorCommand", "execute");
410                 exit(1);
411         }
412 }
413
414 //***************************************************************************************************************
415
416 void SeqErrorCommand::getReferences(){
417         try {
418                 
419                 ifstream referenceFile;
420                 m->openInputFile(referenceFileName, referenceFile);
421                 
422                 int numAmbigSeqs = 0;
423                 
424                 int maxStartPos = 0;
425                 int minEndPos = 100000;
426                 
427                 while(referenceFile){
428                         Sequence currentSeq(referenceFile);
429                         int numAmbigs = currentSeq.getAmbigBases();
430                         if(numAmbigs > 0){      numAmbigSeqs++; }
431                         
432                         int startPos = currentSeq.getStartPos();
433                         if(startPos > maxStartPos)      {       maxStartPos = startPos; }
434
435                         int endPos = currentSeq.getEndPos();
436                         if(endPos < minEndPos)          {       minEndPos = endPos;             }
437                         referenceSeqs.push_back(currentSeq);
438                         m->gobble(referenceFile);
439                 }
440                 referenceFile.close();
441                 numRefs = referenceSeqs.size();
442
443                 
444                 for(int i=0;i<numRefs;i++){
445                         referenceSeqs[i].padToPos(maxStartPos);
446                         referenceSeqs[i].padFromPos(minEndPos);
447                 }
448                 
449                 if(numAmbigSeqs != 0){
450                         m->mothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n");
451                 }               
452                 
453         }
454         catch(exception& e) {
455                 m->errorOut(e, "SeqErrorCommand", "getReferences");
456                 exit(1);
457         }
458 }
459
460 //***************************************************************************************************************
461
462 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
463         try {
464                 if(query.getAlignLength() != reference.getAlignLength()){
465                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
466                 }
467                 int alignLength = query.getAlignLength();
468         
469                 string q = query.getAligned();
470                 string r = reference.getAligned();
471
472                 int started = 0;
473                 Compare errors;
474
475                 for(int i=0;i<alignLength;i++){
476                         if(r[i] != 'N' && q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                  //      no missing data and no double gaps
477                                 started = 1;
478                                 
479                                 if(q[i] == 'A'){
480                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       errors.sequence += 'm'; }
481                                         if(r[i] == 'T'){        errors.AT++;    errors.sequence += 's'; }
482                                         if(r[i] == 'G'){        errors.AG++;    errors.sequence += 's'; }
483                                         if(r[i] == 'C'){        errors.AC++;    errors.sequence += 's'; }
484                                         if(r[i] == '-'){        errors.Ai++;    errors.sequence += 'i'; }
485                                 }
486                                 else if(q[i] == 'T'){
487                                         if(r[i] == 'A'){        errors.TA++;    errors.sequence += 's'; }
488                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       errors.sequence += 'm'; }
489                                         if(r[i] == 'G'){        errors.TG++;    errors.sequence += 's'; }
490                                         if(r[i] == 'C'){        errors.TC++;    errors.sequence += 's'; }
491                                         if(r[i] == '-'){        errors.Ti++;    errors.sequence += 'i'; }
492                                 }
493                                 else if(q[i] == 'G'){
494                                         if(r[i] == 'A'){        errors.GA++;    errors.sequence += 's'; }
495                                         if(r[i] == 'T'){        errors.GT++;    errors.sequence += 's'; }
496                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       errors.sequence += 'm'; }
497                                         if(r[i] == 'C'){        errors.GC++;    errors.sequence += 's'; }
498                                         if(r[i] == '-'){        errors.Gi++;    errors.sequence += 'i'; }
499                                 }
500                                 else if(q[i] == 'C'){
501                                         if(r[i] == 'A'){        errors.CA++;    errors.sequence += 's'; }
502                                         if(r[i] == 'T'){        errors.CT++;    errors.sequence += 's'; }
503                                         if(r[i] == 'G'){        errors.CG++;    errors.sequence += 's'; }
504                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       errors.sequence += 'm'; }
505                                         if(r[i] == '-'){        errors.Ci++;    errors.sequence += 'i'; }
506                                 }
507                                 else if(q[i] == 'N'){
508                                         if(r[i] == 'A'){        errors.NA++;    errors.sequence += 'a'; }
509                                         if(r[i] == 'T'){        errors.NT++;    errors.sequence += 'a'; }
510                                         if(r[i] == 'G'){        errors.NG++;    errors.sequence += 'a'; }
511                                         if(r[i] == 'C'){        errors.NC++;    errors.sequence += 'a'; }
512                                         if(r[i] == '-'){        errors.Ni++;    errors.sequence += 'a'; }
513                                 }
514                                 else if(q[i] == '-' && r[i] != '-'){
515                                         if(r[i] == 'A'){        errors.dA++;    errors.sequence += 'd'; }
516                                         if(r[i] == 'T'){        errors.dT++;    errors.sequence += 'd'; }
517                                         if(r[i] == 'G'){        errors.dG++;    errors.sequence += 'd'; }
518                                         if(r[i] == 'C'){        errors.dC++;    errors.sequence += 'd'; }
519                                 }
520                                 errors.total++; 
521                                 
522                         }
523                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
524                                 if(started == 1){       break;  }
525                         }
526                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
527                                 if(started == 1){       break;  }
528                         }
529                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
530                                 if(started == 1){       break;  }                       
531                         }
532                         
533                 }
534                 errors.mismatches = errors.total-errors.matches;
535                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
536                 errors.queryName = query.getName();
537                 errors.refName = reference.getName();
538                 
539                 return errors;
540         }
541         catch(exception& e) {
542                 m->errorOut(e, "SeqErrorCommand", "getErrors");
543                 exit(1);
544         }
545 }
546
547 //***************************************************************************************************************
548
549 map<string, int> SeqErrorCommand::getWeights(){
550         ifstream nameFile;
551         m->openInputFile(namesFileName, nameFile);
552         
553         string seqName;
554         string redundantSeqs;
555         map<string, int> nameCountMap;
556         
557         while(nameFile){
558                 nameFile >> seqName >> redundantSeqs;
559                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
560                 m->gobble(nameFile);
561         }
562         return nameCountMap;
563 }
564
565
566 //***************************************************************************************************************
567
568 void SeqErrorCommand::printErrorHeader(){
569         try {
570                 errorSummaryFile << "query\treference\tweight\t";
571                 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
572                 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\tnumparents\n";
573                 
574                 errorSummaryFile << setprecision(6);
575                 errorSummaryFile.setf(ios::fixed);
576         }
577         catch(exception& e) {
578                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
579                 exit(1);
580         }
581 }
582
583 //***************************************************************************************************************
584
585 void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs){
586         try {
587
588                 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
589                 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
590                 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
591                 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
592                 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
593                 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
594                 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t';
595                 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
596                 
597                 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                  //insertions
598                 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t';                  //deletions
599                 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';      //substitutions
600                 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';       //ambiguities
601                 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << '\t' << numParentSeqs << endl;
602
603                 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
604                 
605                 int a=0;                int t=1;                int g=2;                int c=3;
606                 int gap=4;              int n=5;
607
608                 if(numParentSeqs == 1 || ignoreChimeras == 0){
609                         substitutionMatrix[a][a] += error.weight * error.AA;
610                         substitutionMatrix[a][t] += error.weight * error.TA;
611                         substitutionMatrix[a][g] += error.weight * error.GA;
612                         substitutionMatrix[a][c] += error.weight * error.CA;
613                         substitutionMatrix[a][gap] += error.weight * error.dA;
614                         substitutionMatrix[a][n] += error.weight * error.NA;
615                         
616                         substitutionMatrix[t][a] += error.weight * error.AT;
617                         substitutionMatrix[t][t] += error.weight * error.TT;
618                         substitutionMatrix[t][g] += error.weight * error.GT;
619                         substitutionMatrix[t][c] += error.weight * error.CT;
620                         substitutionMatrix[t][gap] += error.weight * error.dT;
621                         substitutionMatrix[t][n] += error.weight * error.NT;
622
623                         substitutionMatrix[g][a] += error.weight * error.AG;
624                         substitutionMatrix[g][t] += error.weight * error.TG;
625                         substitutionMatrix[g][g] += error.weight * error.GG;
626                         substitutionMatrix[g][c] += error.weight * error.CG;
627                         substitutionMatrix[g][gap] += error.weight * error.dG;
628                         substitutionMatrix[g][n] += error.weight * error.NG;
629
630                         substitutionMatrix[c][a] += error.weight * error.AC;
631                         substitutionMatrix[c][t] += error.weight * error.TC;
632                         substitutionMatrix[c][g] += error.weight * error.GC;
633                         substitutionMatrix[c][c] += error.weight * error.CC;
634                         substitutionMatrix[c][gap] += error.weight * error.dC;
635                         substitutionMatrix[c][n] += error.weight * error.NC;
636
637                         substitutionMatrix[gap][a] += error.weight * error.Ai;
638                         substitutionMatrix[gap][t] += error.weight * error.Ti;
639                         substitutionMatrix[gap][g] += error.weight * error.Gi;
640                         substitutionMatrix[gap][c] += error.weight * error.Ci;
641                         substitutionMatrix[gap][n] += error.weight * error.Ni;
642                 }
643         }
644         catch(exception& e) {
645                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
646                 exit(1);
647         }
648 }
649
650 //***************************************************************************************************************
651
652 void SeqErrorCommand::printSubMatrix(){
653         try {
654                 string subMatrixFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.matrix";
655                 ofstream subMatrixFile;
656                 m->openOutputFile(subMatrixFileName, subMatrixFile);
657                 outputNames.push_back(subMatrixFileName);  outputTypes["error.matrix"].push_back(subMatrixFileName);
658                 vector<string> bases(6);
659                 bases[0] = "A";
660                 bases[1] = "T";
661                 bases[2] = "G";
662                 bases[3] = "C";
663                 bases[4] = "Gap";
664                 bases[5] = "N";
665                 vector<int> refSums(5,1);
666
667                 for(int i=0;i<5;i++){
668                         subMatrixFile << "\tr" << bases[i];
669                         
670                         for(int j=0;j<6;j++){
671                                 refSums[i] += substitutionMatrix[i][j];                         
672                         }
673                 }
674                 subMatrixFile << endl;
675                 
676                 for(int i=0;i<6;i++){
677                         subMatrixFile << 'q' << bases[i];
678                         for(int j=0;j<5;j++){
679                                 subMatrixFile << '\t' << substitutionMatrix[j][i];                              
680                         }
681                         subMatrixFile << endl;
682                 }
683
684                 subMatrixFile << "total";
685                 for(int i=0;i<5;i++){
686                         subMatrixFile << '\t' << refSums[i];
687                 }
688                 subMatrixFile << endl;
689                 subMatrixFile.close();
690         }
691         catch(exception& e) {
692                 m->errorOut(e, "SeqErrorCommand", "printSubMatrix");
693                 exit(1);
694         }
695 }
696 //***************************************************************************************************************
697
698 void SeqErrorCommand::printErrorFRFile(map<char, vector<int> > errorForward, map<char, vector<int> > errorReverse){
699         try{
700                 string errorForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.forward";
701                 ofstream errorForwardFile;
702                 m->openOutputFile(errorForwardFileName, errorForwardFile);
703                 outputNames.push_back(errorForwardFileName);  outputTypes["error.forward"].push_back(errorForwardFileName);
704
705                 errorForwardFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
706                 for(int i=0;i<maxLength;i++){
707                         float match = (float)errorForward['m'][i];
708                         float subst = (float)errorForward['s'][i];
709                         float insert = (float)errorForward['i'][i];
710                         float del = (float)errorForward['d'][i];
711                         float amb = (float)errorForward['a'][i];
712                         float total = match + subst + insert + del + amb;
713                         if(total == 0){ break;  }
714                         errorForwardFile << i+1 << '\t' << total << '\t' << match/total  << '\t' << subst/total  << '\t' << insert/total  << '\t' << del/total  << '\t' << amb/total << endl;
715                 }
716                 errorForwardFile.close();
717
718                 string errorReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.reverse";
719                 ofstream errorReverseFile;
720                 m->openOutputFile(errorReverseFileName, errorReverseFile);
721                 outputNames.push_back(errorReverseFileName);  outputTypes["error.reverse"].push_back(errorReverseFileName);
722
723                 errorReverseFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
724                 for(int i=0;i<maxLength;i++){
725                         float match = (float)errorReverse['m'][i];
726                         float subst = (float)errorReverse['s'][i];
727                         float insert = (float)errorReverse['i'][i];
728                         float del = (float)errorReverse['d'][i];
729                         float amb = (float)errorReverse['a'][i];
730                         float total = match + subst + insert + del + amb;
731                         if(total == 0){ break;  }
732                         errorReverseFile << i+1 << '\t' << total << '\t' << match/total  << '\t' << subst/total  << '\t' << insert/total  << '\t' << del/total  << '\t' << amb/total << endl;
733                 }
734                 errorReverseFile.close();
735         }
736         catch(exception& e) {
737                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
738                 exit(1);
739         }
740 }
741
742 //***************************************************************************************************************
743
744 void SeqErrorCommand::printErrorQuality(map<char, vector<int> > qScoreErrorMap){
745         try{
746
747                 string errorQualityFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.quality";
748                 ofstream errorQualityFile;
749                 m->openOutputFile(errorQualityFileName, errorQualityFile);
750                 outputNames.push_back(errorQualityFileName);  outputTypes["error.quality"].push_back(errorQualityFileName);
751
752                 errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl;
753                 for(int i=0;i<41;i++){
754                         errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl;
755                 }
756                 errorQualityFile.close();
757         }
758         catch(exception& e) {
759                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
760                 exit(1);
761         }
762 }
763
764
765 //***************************************************************************************************************
766
767 void SeqErrorCommand::printQualityFR(vector<vector<int> > qualForwardMap, vector<vector<int> > qualReverseMap){
768
769         try{
770                 int numRows = 0;
771                 int numColumns = qualForwardMap[0].size();
772
773                 for(int i=0;i<qualForwardMap.size();i++){
774                         for(int j=0;j<numColumns;j++){
775                                 if(qualForwardMap[i][j] != 0){
776                                         if(numRows < i)         {       numRows = i+20;         }
777                                 }
778                         }
779                 }
780
781                 string qualityForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.forward";
782                 ofstream qualityForwardFile;
783                 m->openOutputFile(qualityForwardFileName, qualityForwardFile);
784                 outputNames.push_back(qualityForwardFileName);  outputTypes["error.qual.forward"].push_back(qualityForwardFileName);
785
786                 for(int i=0;i<numColumns;i++){  qualityForwardFile << '\t' << i;        }       qualityForwardFile << endl;
787
788                 for(int i=0;i<numRows;i++){
789                         qualityForwardFile << i+1;
790                         for(int j=0;j<numColumns;j++){
791                                 qualityForwardFile << '\t' << qualForwardMap[i][j];
792                         }
793
794                         qualityForwardFile << endl;
795                 }
796                 qualityForwardFile.close();
797
798                 
799                 string qualityReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.reverse";
800                 ofstream qualityReverseFile;
801                 m->openOutputFile(qualityReverseFileName, qualityReverseFile);
802                 outputNames.push_back(qualityReverseFileName);  outputTypes["error.qual.reverse"].push_back(qualityReverseFileName);
803                 
804                 for(int i=0;i<numColumns;i++){  qualityReverseFile << '\t' << i;        }       qualityReverseFile << endl;
805                 for(int i=0;i<numRows;i++){
806                         
807                         qualityReverseFile << i+1;
808                         for(int j=0;j<numColumns;j++){
809                                 qualityReverseFile << '\t' << qualReverseMap[i][j];
810                         }
811                         qualityReverseFile << endl;
812                 }
813                 qualityReverseFile.close();
814         }
815         catch(exception& e) {
816                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
817                 exit(1);
818         }
819         
820 }
821
822 //***************************************************************************************************************