]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
added citation function to commands
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11 #include "reportfile.h"
12 #include "qualityscores.h"
13 #include "refchimeratest.h"
14
15 //**********************************************************************************************************************
16 vector<string> SeqErrorCommand::setParameters(){        
17         try {
18                 CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pquery);
19                 CommandParameter preference("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(preference);
20                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(pqfile);
21                 CommandParameter preport("report", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(preport);
22                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
23                 CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pignorechimeras);
24                 CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pthreshold);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "SeqErrorCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string SeqErrorCommand::getHelpString(){        
39         try {
40                 string helpString = "";
41                 helpString += "The seq.error command reads a query alignment file and a reference alignment file and creates .....\n";
42                 helpString += "Example seq.error(...).\n";
43                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
44                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n";
45                 return helpString;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "SeqErrorCommand", "getHelpString");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 SeqErrorCommand::SeqErrorCommand(){     
54         try {
55                 abort = true; calledHelp = true; 
56                 vector<string> tempOutNames;
57                 outputTypes["error.summary"] = tempOutNames;
58                 outputTypes["error.seq"] = tempOutNames;
59                 outputTypes["error.quality"] = tempOutNames;
60                 outputTypes["error.qual.forward"] = tempOutNames;
61                 outputTypes["error.qual.reverse"] = tempOutNames;
62                 outputTypes["error.forward"] = tempOutNames;
63                 outputTypes["error.reverse"] = tempOutNames;
64                 outputTypes["error.count"] = tempOutNames;
65                 outputTypes["error.matrix"] = tempOutNames;
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
69                 exit(1);
70         }
71 }
72 //***************************************************************************************************************
73
74 SeqErrorCommand::SeqErrorCommand(string option)  {
75         try {
76                 
77                 abort = false; calledHelp = false;   
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
82                 
83                 else {
84                         string temp;
85                         vector<string> myArray = setParameters();
86                         
87                         OptionParser parser(option);
88                         map<string,string> parameters = parser.getParameters();
89                         
90                         ValidParameters validParameter;
91                         map<string,string>::iterator it;
92                         
93                         //check to make sure all parameters are valid for command
94                         for (it = parameters.begin(); it != parameters.end(); it++) { 
95                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100                         outputTypes["error.summary"] = tempOutNames;
101                         outputTypes["error.seq"] = tempOutNames;
102                         outputTypes["error.quality"] = tempOutNames;
103                         outputTypes["error.qual.forward"] = tempOutNames;
104                         outputTypes["error.qual.reverse"] = tempOutNames;
105                         outputTypes["error.forward"] = tempOutNames;
106                         outputTypes["error.reverse"] = tempOutNames;
107                         outputTypes["error.count"] = tempOutNames;
108                         outputTypes["error.matrix"] = tempOutNames;
109
110                         
111                         //if the user changes the input directory command factory will send this info to us in the output parameter 
112                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
113                         if (inputDir == "not found"){   inputDir = "";          }
114                         else {
115                                 string path;
116                                 it = parameters.find("fasta");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
122                                 }
123                                 
124                                 it = parameters.find("reference");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
130                                 }
131                                 
132                                 it = parameters.find("name");
133                                 //user has given a names file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
138                                 }
139
140                                 it = parameters.find("qfile");
141                                 //user has given a quality score file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
146                                 }
147                                 
148                                 it = parameters.find("report");
149                                 //user has given a alignment report file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["report"] = inputDir + it->second;           }
154                                 }
155                                 
156                         }
157                         //check for required parameters
158                         queryFileName = validParameter.validFile(parameters, "fasta", true);
159                         if (queryFileName == "not found") { 
160                                 queryFileName = m->getFastaFile(); 
161                                 if (queryFileName != "") { m->mothurOut("Using " + queryFileName + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
162                                 else {  m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
163                         }
164                         else if (queryFileName == "not open") { abort = true; } 
165                         
166                         referenceFileName = validParameter.validFile(parameters, "reference", true);
167                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
168                         else if (referenceFileName == "not open") { abort = true; }     
169                         
170
171                         //check for optional parameters
172                         namesFileName = validParameter.validFile(parameters, "name", true);
173                         if(namesFileName == "not found"){       namesFileName = "";     }
174                         else if (namesFileName == "not open") { namesFileName = ""; abort = true; }     
175                         
176                         qualFileName = validParameter.validFile(parameters, "qfile", true);
177                         if(qualFileName == "not found"){        qualFileName = "";      }
178                         else if (qualFileName == "not open") { qualFileName = ""; abort = true; }       
179
180                         reportFileName = validParameter.validFile(parameters, "report", true);
181                         if(reportFileName == "not found"){      reportFileName = "";    }
182                         else if (reportFileName == "not open") { reportFileName = ""; abort = true; }   
183                         
184                         if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
185                                 m->mothurOut("if you use either a qual file or a report file, you have to have both.");
186                                 m->mothurOutEndLine();
187                                 abort = true; 
188                         }
189                         
190                         outputDir = validParameter.validFile(parameters, "outputdir", false);
191                         if (outputDir == "not found"){  
192                                 outputDir = ""; 
193                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
194                         }
195                         
196                         //check for optional parameter and set defaults
197                         // ...at some point should added some additional type checking...
198                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
199                         convert(temp, threshold);  
200                         
201                         temp = validParameter.validFile(parameters, "ignorechimeras", false);   if (temp == "not found") { temp = "1"; }
202                         convert(temp, ignoreChimeras);  
203
204                         substitutionMatrix.resize(6);
205                         for(int i=0;i<6;i++){   substitutionMatrix[i].resize(6,0);      }
206                 }
207         }
208         catch(exception& e) {
209                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
210                 exit(1);
211         }
212 }
213 //***************************************************************************************************************
214
215 int SeqErrorCommand::execute(){
216         try{
217                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
218
219                 maxLength = 2000;
220                 
221                 string errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
222                 m->openOutputFile(errorSummaryFileName, errorSummaryFile);
223                 outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
224                 printErrorHeader();
225                 
226                 string errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq";
227                 m->openOutputFile(errorSeqFileName, errorSeqFile);
228                 outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName);
229
230                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
231
232                 map<string, int> weights;
233                 if(namesFileName != ""){        weights = getWeights(); }
234                 
235                 ifstream queryFile;
236                 m->openInputFile(queryFileName, queryFile);
237                 
238                 ifstream reportFile;
239                 ifstream qualFile;
240
241                 ReportFile report;
242                 QualityScores quality;
243                 vector<vector<int> > qualForwardMap;
244                 vector<vector<int> > qualReverseMap;
245                 
246                 if(qualFileName != "" && reportFileName != ""){
247                         m->openInputFile(qualFileName, qualFile);
248                         report = ReportFile(reportFile, reportFileName);
249                         
250                         qualForwardMap.resize(maxLength);
251                         qualReverseMap.resize(maxLength);
252                         for(int i=0;i<maxLength;i++){
253                                 qualForwardMap[i].assign(41,0);
254                                 qualReverseMap[i].assign(41,0);
255                         }                               
256                 }
257                 
258                 int totalBases = 0;
259                 int totalMatches = 0;
260                 
261                 vector<int> misMatchCounts(11, 0);
262                 int maxMismatch = 0;
263                 int numSeqs = 0;
264                 
265                 map<string, int>::iterator it;
266                 map<char, vector<int> > qScoreErrorMap;
267                 qScoreErrorMap['m'].assign(41, 0);
268                 qScoreErrorMap['s'].assign(41, 0);
269                 qScoreErrorMap['i'].assign(41, 0);
270                 qScoreErrorMap['a'].assign(41, 0);
271                 
272                 map<char, vector<int> > errorForward;
273                 errorForward['m'].assign(maxLength,0);
274                 errorForward['s'].assign(maxLength,0);
275                 errorForward['i'].assign(maxLength,0);
276                 errorForward['d'].assign(maxLength,0);
277                 errorForward['a'].assign(maxLength,0);
278                 
279                 map<char, vector<int> > errorReverse;
280                 errorReverse['m'].assign(maxLength,0);
281                 errorReverse['s'].assign(maxLength,0);
282                 errorReverse['i'].assign(maxLength,0);
283                 errorReverse['d'].assign(maxLength,0);
284                 errorReverse['a'].assign(maxLength,0);  
285                 
286
287                 string errorChimeraFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.chimera";
288                 RefChimeraTest chimeraTest(referenceSeqs, errorChimeraFileName);
289                 outputNames.push_back(errorChimeraFileName); outputTypes["error.chimera"].push_back(errorChimeraFileName);
290                 
291                 vector<string> megaAlignVector(numRefs, "");
292
293                 int index = 0;
294                 bool ignoreSeq = 0;
295                 
296                 while(queryFile){
297
298                         if (m->control_pressed) { errorSummaryFile.close();     errorSeqFile.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
299                 
300                         Sequence query(queryFile);
301                         
302                         int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned());
303                         int closestRefIndex = chimeraTest.getClosestRefIndex();
304
305                         if(numParentSeqs > 1 && ignoreChimeras == 1)    {       ignoreSeq = 1;  }
306                         else                                                                                    {       ignoreSeq = 0;  }
307
308                         Compare minCompare = getErrors(query, referenceSeqs[closestRefIndex]);
309                         
310                         if(namesFileName != ""){
311                                 it = weights.find(query.getName());
312                                 minCompare.weight = it->second;
313                         }
314                         else{   minCompare.weight = 1;  }
315
316                         printErrorData(minCompare, numParentSeqs);
317                 
318                         if(!ignoreSeq){
319
320                                 for(int i=0;i<minCompare.sequence.length();i++){
321                                         char letter = minCompare.sequence[i];
322
323                                         errorForward[letter][i] += minCompare.weight;
324                                         errorReverse[letter][minCompare.total-i-1] += minCompare.weight;                                
325                                 }
326                         }
327
328                         if(qualFileName != "" && reportFileName != ""){
329                                 report = ReportFile(reportFile);
330                                 
331 //                              int origLength = report.getQueryLength();
332                                 int startBase = report.getQueryStart();
333                                 int endBase = report.getQueryEnd();
334
335                                 quality = QualityScores(qualFile);
336
337                                 if(!ignoreSeq){
338                                         quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
339                                         quality.updateForwardMap(qualForwardMap, startBase, endBase, minCompare.weight);
340                                         quality.updateReverseMap(qualReverseMap, startBase, endBase, minCompare.weight);
341                                 }
342                         }                       
343
344                         if(minCompare.errorRate < threshold && !ignoreSeq){
345                                 totalBases += (minCompare.total * minCompare.weight);
346                                 totalMatches += minCompare.matches * minCompare.weight;
347                                 if(minCompare.mismatches > maxMismatch){
348                                         maxMismatch = minCompare.mismatches;
349                                         misMatchCounts.resize(maxMismatch + 1, 0);
350                                 }                               
351                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
352                                 numSeqs++;
353                                 
354                                 megaAlignVector[closestRefIndex] += query.getInlineSeq() + '\n';
355                         }
356
357                         index++;
358                         
359                         if(index % 1000 == 0){  m->mothurOut(toString(index) + '\n');   }
360                 }
361                 queryFile.close();
362                 errorSummaryFile.close();       
363                 errorSeqFile.close();
364
365                 if(qualFileName != "" && reportFileName != ""){         
366                         printErrorQuality(qScoreErrorMap);
367                         printQualityFR(qualForwardMap, qualReverseMap);
368                 }
369                 
370                 printErrorFRFile(errorForward, errorReverse);
371                 
372                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
373
374                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count";
375                 ofstream errorCountFile;
376                 m->openOutputFile(errorCountFileName, errorCountFile);
377                 outputNames.push_back(errorCountFileName);  outputTypes["error.count"].push_back(errorCountFileName);
378                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n");
379                 m->mothurOut("Errors\tSequences\n");
380                 errorCountFile << "Errors\tSequences\n";                
381                 for(int i=0;i<misMatchCounts.size();i++){
382                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
383                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
384                 }
385                 errorCountFile.close();
386                 
387                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
388
389                 printSubMatrix();
390                                 
391                 string megAlignmentFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.ref-query";
392                 ofstream megAlignmentFile;
393                 m->openOutputFile(megAlignmentFileName, megAlignmentFile);
394                 outputNames.push_back(megAlignmentFileName);  outputTypes["error.ref-query"].push_back(megAlignmentFileName);
395                 
396                 for(int i=0;i<numRefs;i++){
397                         megAlignmentFile << referenceSeqs[i].getInlineSeq() << endl;
398                         megAlignmentFile << megaAlignVector[i] << endl;
399                 }
400                 
401                 
402                 m->mothurOutEndLine();
403                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
404                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
405                 m->mothurOutEndLine();
406                 
407                 return 0;       
408         }
409         catch(exception& e) {
410                 m->errorOut(e, "SeqErrorCommand", "execute");
411                 exit(1);
412         }
413 }
414
415 //***************************************************************************************************************
416
417 void SeqErrorCommand::getReferences(){
418         try {
419                 
420                 ifstream referenceFile;
421                 m->openInputFile(referenceFileName, referenceFile);
422                 
423                 int numAmbigSeqs = 0;
424                 
425                 int maxStartPos = 0;
426                 int minEndPos = 100000;
427                 
428                 while(referenceFile){
429                         Sequence currentSeq(referenceFile);
430                         int numAmbigs = currentSeq.getAmbigBases();
431                         if(numAmbigs > 0){      numAmbigSeqs++; }
432                         
433 //                      int startPos = currentSeq.getStartPos();
434 //                      if(startPos > maxStartPos)      {       maxStartPos = startPos; }
435 //
436 //                      int endPos = currentSeq.getEndPos();
437 //                      if(endPos < minEndPos)          {       minEndPos = endPos;             }
438                         referenceSeqs.push_back(currentSeq);
439                                 
440                         m->gobble(referenceFile);
441                 }
442                 referenceFile.close();
443                 numRefs = referenceSeqs.size();
444
445                 
446                 for(int i=0;i<numRefs;i++){
447                         referenceSeqs[i].padToPos(maxStartPos);
448                         referenceSeqs[i].padFromPos(minEndPos);
449                 }
450                 
451                 if(numAmbigSeqs != 0){
452                         m->mothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n");
453                 }               
454                 
455         }
456         catch(exception& e) {
457                 m->errorOut(e, "SeqErrorCommand", "getReferences");
458                 exit(1);
459         }
460 }
461
462 //***************************************************************************************************************
463
464 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
465         try {
466                 if(query.getAlignLength() != reference.getAlignLength()){
467                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
468                 }
469                 int alignLength = query.getAlignLength();
470         
471                 string q = query.getAligned();
472                 string r = reference.getAligned();
473
474                 int started = 0;
475                 Compare errors;
476
477                 for(int i=0;i<alignLength;i++){
478                         if(r[i] != 'N' && q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                  //      no missing data and no double gaps
479                                 started = 1;
480                                 
481                                 if(q[i] == 'A'){
482                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       errors.sequence += 'm'; }
483                                         if(r[i] == 'T'){        errors.AT++;    errors.sequence += 's'; }
484                                         if(r[i] == 'G'){        errors.AG++;    errors.sequence += 's'; }
485                                         if(r[i] == 'C'){        errors.AC++;    errors.sequence += 's'; }
486                                         if(r[i] == '-'){        errors.Ai++;    errors.sequence += 'i'; }
487                                 }
488                                 else if(q[i] == 'T'){
489                                         if(r[i] == 'A'){        errors.TA++;    errors.sequence += 's'; }
490                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       errors.sequence += 'm'; }
491                                         if(r[i] == 'G'){        errors.TG++;    errors.sequence += 's'; }
492                                         if(r[i] == 'C'){        errors.TC++;    errors.sequence += 's'; }
493                                         if(r[i] == '-'){        errors.Ti++;    errors.sequence += 'i'; }
494                                 }
495                                 else if(q[i] == 'G'){
496                                         if(r[i] == 'A'){        errors.GA++;    errors.sequence += 's'; }
497                                         if(r[i] == 'T'){        errors.GT++;    errors.sequence += 's'; }
498                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       errors.sequence += 'm'; }
499                                         if(r[i] == 'C'){        errors.GC++;    errors.sequence += 's'; }
500                                         if(r[i] == '-'){        errors.Gi++;    errors.sequence += 'i'; }
501                                 }
502                                 else if(q[i] == 'C'){
503                                         if(r[i] == 'A'){        errors.CA++;    errors.sequence += 's'; }
504                                         if(r[i] == 'T'){        errors.CT++;    errors.sequence += 's'; }
505                                         if(r[i] == 'G'){        errors.CG++;    errors.sequence += 's'; }
506                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       errors.sequence += 'm'; }
507                                         if(r[i] == '-'){        errors.Ci++;    errors.sequence += 'i'; }
508                                 }
509                                 else if(q[i] == 'N'){
510                                         if(r[i] == 'A'){        errors.NA++;    errors.sequence += 'a'; }
511                                         if(r[i] == 'T'){        errors.NT++;    errors.sequence += 'a'; }
512                                         if(r[i] == 'G'){        errors.NG++;    errors.sequence += 'a'; }
513                                         if(r[i] == 'C'){        errors.NC++;    errors.sequence += 'a'; }
514                                         if(r[i] == '-'){        errors.Ni++;    errors.sequence += 'a'; }
515                                 }
516                                 else if(q[i] == '-' && r[i] != '-'){
517                                         if(r[i] == 'A'){        errors.dA++;    errors.sequence += 'd'; }
518                                         if(r[i] == 'T'){        errors.dT++;    errors.sequence += 'd'; }
519                                         if(r[i] == 'G'){        errors.dG++;    errors.sequence += 'd'; }
520                                         if(r[i] == 'C'){        errors.dC++;    errors.sequence += 'd'; }
521                                 }
522                                 errors.total++; 
523                                 
524                         }
525                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
526                                 if(started == 1){       break;  }
527                         }
528                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
529                                 if(started == 1){       break;  }
530                         }
531                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
532                                 if(started == 1){       break;  }                       
533                         }
534                         
535                 }
536                 errors.mismatches = errors.total-errors.matches;
537                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
538                 errors.queryName = query.getName();
539                 errors.refName = reference.getName();
540                 
541                 return errors;
542         }
543         catch(exception& e) {
544                 m->errorOut(e, "SeqErrorCommand", "getErrors");
545                 exit(1);
546         }
547 }
548
549 //***************************************************************************************************************
550
551 map<string, int> SeqErrorCommand::getWeights(){
552         ifstream nameFile;
553         m->openInputFile(namesFileName, nameFile);
554         
555         string seqName;
556         string redundantSeqs;
557         map<string, int> nameCountMap;
558         
559         while(nameFile){
560                 nameFile >> seqName >> redundantSeqs;
561                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
562                 m->gobble(nameFile);
563         }
564         return nameCountMap;
565 }
566
567
568 //***************************************************************************************************************
569
570 void SeqErrorCommand::printErrorHeader(){
571         try {
572                 errorSummaryFile << "query\treference\tweight\t";
573                 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
574                 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\tnumparents\n";
575                 
576                 errorSummaryFile << setprecision(6);
577                 errorSummaryFile.setf(ios::fixed);
578         }
579         catch(exception& e) {
580                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
581                 exit(1);
582         }
583 }
584
585 //***************************************************************************************************************
586
587 void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs){
588         try {
589
590                 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
591                 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
592                 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
593                 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
594                 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
595                 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
596                 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t';
597                 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
598                 
599                 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                  //insertions
600                 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t';                  //deletions
601                 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';      //substitutions
602                 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';       //ambiguities
603                 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << '\t' << numParentSeqs << endl;
604
605                 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
606                 
607                 int a=0;                int t=1;                int g=2;                int c=3;
608                 int gap=4;              int n=5;
609
610                 if(numParentSeqs == 1 || ignoreChimeras == 0){
611                         substitutionMatrix[a][a] += error.weight * error.AA;
612                         substitutionMatrix[a][t] += error.weight * error.TA;
613                         substitutionMatrix[a][g] += error.weight * error.GA;
614                         substitutionMatrix[a][c] += error.weight * error.CA;
615                         substitutionMatrix[a][gap] += error.weight * error.dA;
616                         substitutionMatrix[a][n] += error.weight * error.NA;
617                         
618                         substitutionMatrix[t][a] += error.weight * error.AT;
619                         substitutionMatrix[t][t] += error.weight * error.TT;
620                         substitutionMatrix[t][g] += error.weight * error.GT;
621                         substitutionMatrix[t][c] += error.weight * error.CT;
622                         substitutionMatrix[t][gap] += error.weight * error.dT;
623                         substitutionMatrix[t][n] += error.weight * error.NT;
624
625                         substitutionMatrix[g][a] += error.weight * error.AG;
626                         substitutionMatrix[g][t] += error.weight * error.TG;
627                         substitutionMatrix[g][g] += error.weight * error.GG;
628                         substitutionMatrix[g][c] += error.weight * error.CG;
629                         substitutionMatrix[g][gap] += error.weight * error.dG;
630                         substitutionMatrix[g][n] += error.weight * error.NG;
631
632                         substitutionMatrix[c][a] += error.weight * error.AC;
633                         substitutionMatrix[c][t] += error.weight * error.TC;
634                         substitutionMatrix[c][g] += error.weight * error.GC;
635                         substitutionMatrix[c][c] += error.weight * error.CC;
636                         substitutionMatrix[c][gap] += error.weight * error.dC;
637                         substitutionMatrix[c][n] += error.weight * error.NC;
638
639                         substitutionMatrix[gap][a] += error.weight * error.Ai;
640                         substitutionMatrix[gap][t] += error.weight * error.Ti;
641                         substitutionMatrix[gap][g] += error.weight * error.Gi;
642                         substitutionMatrix[gap][c] += error.weight * error.Ci;
643                         substitutionMatrix[gap][n] += error.weight * error.Ni;
644                 }
645         }
646         catch(exception& e) {
647                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
648                 exit(1);
649         }
650 }
651
652 //***************************************************************************************************************
653
654 void SeqErrorCommand::printSubMatrix(){
655         try {
656                 string subMatrixFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.matrix";
657                 ofstream subMatrixFile;
658                 m->openOutputFile(subMatrixFileName, subMatrixFile);
659                 outputNames.push_back(subMatrixFileName);  outputTypes["error.matrix"].push_back(subMatrixFileName);
660                 vector<string> bases(6);
661                 bases[0] = "A";
662                 bases[1] = "T";
663                 bases[2] = "G";
664                 bases[3] = "C";
665                 bases[4] = "Gap";
666                 bases[5] = "N";
667                 vector<int> refSums(5,1);
668
669                 for(int i=0;i<5;i++){
670                         subMatrixFile << "\tr" << bases[i];
671                         
672                         for(int j=0;j<6;j++){
673                                 refSums[i] += substitutionMatrix[i][j];                         
674                         }
675                 }
676                 subMatrixFile << endl;
677                 
678                 for(int i=0;i<6;i++){
679                         subMatrixFile << 'q' << bases[i];
680                         for(int j=0;j<5;j++){
681                                 subMatrixFile << '\t' << substitutionMatrix[j][i];                              
682                         }
683                         subMatrixFile << endl;
684                 }
685
686                 subMatrixFile << "total";
687                 for(int i=0;i<5;i++){
688                         subMatrixFile << '\t' << refSums[i];
689                 }
690                 subMatrixFile << endl;
691                 subMatrixFile.close();
692         }
693         catch(exception& e) {
694                 m->errorOut(e, "SeqErrorCommand", "printSubMatrix");
695                 exit(1);
696         }
697 }
698 //***************************************************************************************************************
699
700 void SeqErrorCommand::printErrorFRFile(map<char, vector<int> > errorForward, map<char, vector<int> > errorReverse){
701         try{
702                 string errorForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.forward";
703                 ofstream errorForwardFile;
704                 m->openOutputFile(errorForwardFileName, errorForwardFile);
705                 outputNames.push_back(errorForwardFileName);  outputTypes["error.forward"].push_back(errorForwardFileName);
706
707                 errorForwardFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
708                 for(int i=0;i<maxLength;i++){
709                         float match = (float)errorForward['m'][i];
710                         float subst = (float)errorForward['s'][i];
711                         float insert = (float)errorForward['i'][i];
712                         float del = (float)errorForward['d'][i];
713                         float amb = (float)errorForward['a'][i];
714                         float total = match + subst + insert + del + amb;
715                         if(total == 0){ break;  }
716                         errorForwardFile << i+1 << '\t' << total << '\t' << match/total  << '\t' << subst/total  << '\t' << insert/total  << '\t' << del/total  << '\t' << amb/total << endl;
717                 }
718                 errorForwardFile.close();
719
720                 string errorReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.reverse";
721                 ofstream errorReverseFile;
722                 m->openOutputFile(errorReverseFileName, errorReverseFile);
723                 outputNames.push_back(errorReverseFileName);  outputTypes["error.reverse"].push_back(errorReverseFileName);
724
725                 errorReverseFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
726                 for(int i=0;i<maxLength;i++){
727                         float match = (float)errorReverse['m'][i];
728                         float subst = (float)errorReverse['s'][i];
729                         float insert = (float)errorReverse['i'][i];
730                         float del = (float)errorReverse['d'][i];
731                         float amb = (float)errorReverse['a'][i];
732                         float total = match + subst + insert + del + amb;
733                         if(total == 0){ break;  }
734                         errorReverseFile << i+1 << '\t' << total << '\t' << match/total  << '\t' << subst/total  << '\t' << insert/total  << '\t' << del/total  << '\t' << amb/total << endl;
735                 }
736                 errorReverseFile.close();
737         }
738         catch(exception& e) {
739                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
740                 exit(1);
741         }
742 }
743
744 //***************************************************************************************************************
745
746 void SeqErrorCommand::printErrorQuality(map<char, vector<int> > qScoreErrorMap){
747         try{
748
749                 string errorQualityFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.quality";
750                 ofstream errorQualityFile;
751                 m->openOutputFile(errorQualityFileName, errorQualityFile);
752                 outputNames.push_back(errorQualityFileName);  outputTypes["error.quality"].push_back(errorQualityFileName);
753
754                 errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl;
755                 for(int i=0;i<41;i++){
756                         errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl;
757                 }
758                 errorQualityFile.close();
759         }
760         catch(exception& e) {
761                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
762                 exit(1);
763         }
764 }
765
766
767 //***************************************************************************************************************
768
769 void SeqErrorCommand::printQualityFR(vector<vector<int> > qualForwardMap, vector<vector<int> > qualReverseMap){
770
771         try{
772                 int numRows = 0;
773                 int numColumns = qualForwardMap[0].size();
774
775                 for(int i=0;i<qualForwardMap.size();i++){
776                         for(int j=0;j<numColumns;j++){
777                                 if(qualForwardMap[i][j] != 0){
778                                         if(numRows < i)         {       numRows = i+20;         }
779                                 }
780                         }
781                 }
782
783                 string qualityForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.forward";
784                 ofstream qualityForwardFile;
785                 m->openOutputFile(qualityForwardFileName, qualityForwardFile);
786                 outputNames.push_back(qualityForwardFileName);  outputTypes["error.qual.forward"].push_back(qualityForwardFileName);
787
788                 for(int i=0;i<numColumns;i++){  qualityForwardFile << '\t' << i;        }       qualityForwardFile << endl;
789
790                 for(int i=0;i<numRows;i++){
791                         qualityForwardFile << i+1;
792                         for(int j=0;j<numColumns;j++){
793                                 qualityForwardFile << '\t' << qualForwardMap[i][j];
794                         }
795
796                         qualityForwardFile << endl;
797                 }
798                 qualityForwardFile.close();
799
800                 
801                 string qualityReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.reverse";
802                 ofstream qualityReverseFile;
803                 m->openOutputFile(qualityReverseFileName, qualityReverseFile);
804                 outputNames.push_back(qualityReverseFileName);  outputTypes["error.qual.reverse"].push_back(qualityReverseFileName);
805                 
806                 for(int i=0;i<numColumns;i++){  qualityReverseFile << '\t' << i;        }       qualityReverseFile << endl;
807                 for(int i=0;i<numRows;i++){
808                         
809                         qualityReverseFile << i+1;
810                         for(int j=0;j<numColumns;j++){
811                                 qualityReverseFile << '\t' << qualReverseMap[i][j];
812                         }
813                         qualityReverseFile << endl;
814                 }
815                 qualityReverseFile.close();
816         }
817         catch(exception& e) {
818                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
819                 exit(1);
820         }
821         
822 }
823
824 //***************************************************************************************************************