]> git.donarmstrong.com Git - mothur.git/blob - seqerrorcommand.cpp
some alterations to chimera.slayer
[mothur.git] / seqerrorcommand.cpp
1 /*
2  *  seqerrorcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 7/15/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "seqerrorcommand.h"
11 #include "reportfile.h"
12 #include "qualityscores.h"
13 #include "refchimeratest.h"
14
15 //**********************************************************************************************************************
16 vector<string> SeqErrorCommand::setParameters(){        
17         try {
18                 CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pquery);
19                 CommandParameter preference("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(preference);
20                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(pqfile);
21                 CommandParameter preport("report", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(preport);
22                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
23                 CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pignorechimeras);
24                 CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pthreshold);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "SeqErrorCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string SeqErrorCommand::getHelpString(){        
39         try {
40                 string helpString = "";
41                 helpString += "The seq.error command reads a query alignment file and a reference alignment file and creates .....\n";
42                 helpString += "Example seq.error(...).\n";
43                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
44                 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n";
45                 return helpString;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "SeqErrorCommand", "getHelpString");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 SeqErrorCommand::SeqErrorCommand(){     
54         try {
55                 abort = true; calledHelp = true; 
56                 vector<string> tempOutNames;
57                 outputTypes["error.summary"] = tempOutNames;
58                 outputTypes["error.seq"] = tempOutNames;
59                 outputTypes["error.quality"] = tempOutNames;
60                 outputTypes["error.qual.forward"] = tempOutNames;
61                 outputTypes["error.qual.reverse"] = tempOutNames;
62                 outputTypes["error.forward"] = tempOutNames;
63                 outputTypes["error.reverse"] = tempOutNames;
64                 outputTypes["error.count"] = tempOutNames;
65                 outputTypes["error.matrix"] = tempOutNames;
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
69                 exit(1);
70         }
71 }
72 //***************************************************************************************************************
73
74 SeqErrorCommand::SeqErrorCommand(string option)  {
75         try {
76                 
77                 abort = false; calledHelp = false;   
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 
82                 else {
83                         string temp;
84                         vector<string> myArray = setParameters();
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter;
90                         map<string,string>::iterator it;
91                         
92                         //check to make sure all parameters are valid for command
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99                         outputTypes["error.summary"] = tempOutNames;
100                         outputTypes["error.seq"] = tempOutNames;
101                         outputTypes["error.quality"] = tempOutNames;
102                         outputTypes["error.qual.forward"] = tempOutNames;
103                         outputTypes["error.qual.reverse"] = tempOutNames;
104                         outputTypes["error.forward"] = tempOutNames;
105                         outputTypes["error.reverse"] = tempOutNames;
106                         outputTypes["error.count"] = tempOutNames;
107                         outputTypes["error.matrix"] = tempOutNames;
108
109                         
110                         //if the user changes the input directory command factory will send this info to us in the output parameter 
111                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
112                         if (inputDir == "not found"){   inputDir = "";          }
113                         else {
114                                 string path;
115                                 it = parameters.find("fasta");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
121                                 }
122                                 
123                                 it = parameters.find("reference");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["reference"] = inputDir + it->second;                }
129                                 }
130                                 
131                                 it = parameters.find("name");
132                                 //user has given a names file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
137                                 }
138
139                                 it = parameters.find("qfile");
140                                 //user has given a quality score file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
145                                 }
146                                 
147                                 it = parameters.find("report");
148                                 //user has given a alignment report file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["report"] = inputDir + it->second;           }
153                                 }
154                                 
155                         }
156                         //check for required parameters
157                         queryFileName = validParameter.validFile(parameters, "fasta", true);
158                         if (queryFileName == "not found") { 
159                                 queryFileName = m->getFastaFile(); 
160                                 if (queryFileName != "") { m->mothurOut("Using " + queryFileName + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
161                                 else {  m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
162                         }
163                         else if (queryFileName == "not open") { abort = true; } 
164                         
165                         referenceFileName = validParameter.validFile(parameters, "reference", true);
166                         if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; }
167                         else if (referenceFileName == "not open") { abort = true; }     
168                         
169
170                         //check for optional parameters
171                         namesFileName = validParameter.validFile(parameters, "name", true);
172                         if(namesFileName == "not found"){       namesFileName = "";     }
173                         else if (namesFileName == "not open") { namesFileName = ""; abort = true; }     
174                         
175                         qualFileName = validParameter.validFile(parameters, "qfile", true);
176                         if(qualFileName == "not found"){        qualFileName = "";      }
177                         else if (qualFileName == "not open") { qualFileName = ""; abort = true; }       
178
179                         reportFileName = validParameter.validFile(parameters, "report", true);
180                         if(reportFileName == "not found"){      reportFileName = "";    }
181                         else if (reportFileName == "not open") { reportFileName = ""; abort = true; }   
182                         
183                         if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
184                                 m->mothurOut("if you use either a qual file or a report file, you have to have both.");
185                                 m->mothurOutEndLine();
186                                 abort = true; 
187                         }
188                         
189                         outputDir = validParameter.validFile(parameters, "outputdir", false);
190                         if (outputDir == "not found"){  
191                                 outputDir = ""; 
192                                 outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it   
193                         }
194                         
195                         //check for optional parameter and set defaults
196                         // ...at some point should added some additional type checking...
197                         temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found") { temp = "1.00"; }
198                         convert(temp, threshold);  
199                         
200                         temp = validParameter.validFile(parameters, "ignorechimeras", false);   if (temp == "not found") { temp = "1"; }
201                         convert(temp, ignoreChimeras);  
202
203                         substitutionMatrix.resize(6);
204                         for(int i=0;i<6;i++){   substitutionMatrix[i].resize(6,0);      }
205                 }
206         }
207         catch(exception& e) {
208                 m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand");
209                 exit(1);
210         }
211 }
212 //***************************************************************************************************************
213
214 int SeqErrorCommand::execute(){
215         try{
216                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
217
218                 maxLength = 2000;
219                 
220                 string errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
221                 m->openOutputFile(errorSummaryFileName, errorSummaryFile);
222                 outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
223                 printErrorHeader();
224                 
225                 string errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq";
226                 m->openOutputFile(errorSeqFileName, errorSeqFile);
227                 outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName);
228
229                 getReferences();        //read in reference sequences - make sure there's no ambiguous bases
230
231                 map<string, int> weights;
232                 if(namesFileName != ""){        weights = getWeights(); }
233                 
234                 ifstream queryFile;
235                 m->openInputFile(queryFileName, queryFile);
236                 
237                 ifstream reportFile;
238                 ifstream qualFile;
239
240                 ReportFile report;
241                 QualityScores quality;
242                 vector<vector<int> > qualForwardMap;
243                 vector<vector<int> > qualReverseMap;
244                 
245                 if(qualFileName != "" && reportFileName != ""){
246                         m->openInputFile(qualFileName, qualFile);
247                         report = ReportFile(reportFile, reportFileName);
248                         
249                         qualForwardMap.resize(maxLength);
250                         qualReverseMap.resize(maxLength);
251                         for(int i=0;i<maxLength;i++){
252                                 qualForwardMap[i].assign(41,0);
253                                 qualReverseMap[i].assign(41,0);
254                         }                               
255                 }
256                 
257                 int totalBases = 0;
258                 int totalMatches = 0;
259                 
260                 vector<int> misMatchCounts(11, 0);
261                 int maxMismatch = 0;
262                 int numSeqs = 0;
263                 
264                 map<string, int>::iterator it;
265                 map<char, vector<int> > qScoreErrorMap;
266                 qScoreErrorMap['m'].assign(41, 0);
267                 qScoreErrorMap['s'].assign(41, 0);
268                 qScoreErrorMap['i'].assign(41, 0);
269                 qScoreErrorMap['a'].assign(41, 0);
270                 
271                 map<char, vector<int> > errorForward;
272                 errorForward['m'].assign(maxLength,0);
273                 errorForward['s'].assign(maxLength,0);
274                 errorForward['i'].assign(maxLength,0);
275                 errorForward['d'].assign(maxLength,0);
276                 errorForward['a'].assign(maxLength,0);
277                 
278                 map<char, vector<int> > errorReverse;
279                 errorReverse['m'].assign(maxLength,0);
280                 errorReverse['s'].assign(maxLength,0);
281                 errorReverse['i'].assign(maxLength,0);
282                 errorReverse['d'].assign(maxLength,0);
283                 errorReverse['a'].assign(maxLength,0);  
284                 
285
286                 string errorChimeraFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.chimera";
287                 RefChimeraTest chimeraTest(referenceSeqs, errorChimeraFileName);
288                 outputNames.push_back(errorChimeraFileName); outputTypes["error.chimera"].push_back(errorChimeraFileName);
289                 
290                 vector<string> megaAlignVector(numRefs, "");
291
292                 int index = 0;
293                 bool ignoreSeq = 0;
294                 
295                 while(queryFile){
296
297                         if (m->control_pressed) { errorSummaryFile.close();     errorSeqFile.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
298                 
299                         Sequence query(queryFile);
300                         
301                         int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned());
302                         int closestRefIndex = chimeraTest.getClosestRefIndex();
303
304                         if(numParentSeqs > 1 && ignoreChimeras == 1)    {       ignoreSeq = 1;  }
305                         else                                                                                    {       ignoreSeq = 0;  }
306
307                         Compare minCompare = getErrors(query, referenceSeqs[closestRefIndex]);
308                         
309                         if(namesFileName != ""){
310                                 it = weights.find(query.getName());
311                                 minCompare.weight = it->second;
312                         }
313                         else{   minCompare.weight = 1;  }
314
315                         printErrorData(minCompare, numParentSeqs);
316                 
317                         if(!ignoreSeq){
318
319                                 for(int i=0;i<minCompare.sequence.length();i++){
320                                         char letter = minCompare.sequence[i];
321
322                                         errorForward[letter][i] += minCompare.weight;
323                                         errorReverse[letter][minCompare.total-i-1] += minCompare.weight;                                
324                                 }
325                         }
326
327                         if(qualFileName != "" && reportFileName != ""){
328                                 report = ReportFile(reportFile);
329                                 
330 //                              int origLength = report.getQueryLength();
331                                 int startBase = report.getQueryStart();
332                                 int endBase = report.getQueryEnd();
333
334                                 quality = QualityScores(qualFile);
335
336                                 if(!ignoreSeq){
337                                         quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
338                                         quality.updateForwardMap(qualForwardMap, startBase, endBase, minCompare.weight);
339                                         quality.updateReverseMap(qualReverseMap, startBase, endBase, minCompare.weight);
340                                 }
341                         }                       
342
343                         if(minCompare.errorRate < threshold && !ignoreSeq){
344                                 totalBases += (minCompare.total * minCompare.weight);
345                                 totalMatches += minCompare.matches * minCompare.weight;
346                                 if(minCompare.mismatches > maxMismatch){
347                                         maxMismatch = minCompare.mismatches;
348                                         misMatchCounts.resize(maxMismatch + 1, 0);
349                                 }                               
350                                 misMatchCounts[minCompare.mismatches] += minCompare.weight;
351                                 numSeqs++;
352                                 
353                                 megaAlignVector[closestRefIndex] += query.getInlineSeq() + '\n';
354                         }
355
356                         index++;
357                         
358                         if(index % 1000 == 0){  m->mothurOut(toString(index) + '\n');   }
359                 }
360                 queryFile.close();
361                 errorSummaryFile.close();       
362                 errorSeqFile.close();
363
364                 if(qualFileName != "" && reportFileName != ""){         
365                         printErrorQuality(qScoreErrorMap);
366                         printQualityFR(qualForwardMap, qualReverseMap);
367                 }
368                 
369                 printErrorFRFile(errorForward, errorReverse);
370                 
371                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
372
373                 string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count";
374                 ofstream errorCountFile;
375                 m->openOutputFile(errorCountFileName, errorCountFile);
376                 outputNames.push_back(errorCountFileName);  outputTypes["error.count"].push_back(errorCountFileName);
377                 m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n");
378                 m->mothurOut("Errors\tSequences\n");
379                 errorCountFile << "Errors\tSequences\n";                
380                 for(int i=0;i<misMatchCounts.size();i++){
381                         m->mothurOut(toString(i) + '\t' + toString(misMatchCounts[i]) + '\n');
382                         errorCountFile << i << '\t' << misMatchCounts[i] << endl;
383                 }
384                 errorCountFile.close();
385                 
386                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
387
388                 printSubMatrix();
389                                 
390                 string megAlignmentFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.ref-query";
391                 ofstream megAlignmentFile;
392                 m->openOutputFile(megAlignmentFileName, megAlignmentFile);
393                 outputNames.push_back(megAlignmentFileName);  outputTypes["error.ref-query"].push_back(megAlignmentFileName);
394                 
395                 for(int i=0;i<numRefs;i++){
396                         megAlignmentFile << referenceSeqs[i].getInlineSeq() << endl;
397                         megAlignmentFile << megaAlignVector[i] << endl;
398                 }
399                 
400                 
401                 m->mothurOutEndLine();
402                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
403                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
404                 m->mothurOutEndLine();
405                 
406                 return 0;       
407         }
408         catch(exception& e) {
409                 m->errorOut(e, "SeqErrorCommand", "execute");
410                 exit(1);
411         }
412 }
413
414 //***************************************************************************************************************
415
416 void SeqErrorCommand::getReferences(){
417         try {
418                 
419                 ifstream referenceFile;
420                 m->openInputFile(referenceFileName, referenceFile);
421                 
422                 int numAmbigSeqs = 0;
423                 
424                 int maxStartPos = 0;
425                 int minEndPos = 100000;
426                 
427                 while(referenceFile){
428                         Sequence currentSeq(referenceFile);
429                         int numAmbigs = currentSeq.getAmbigBases();
430                         if(numAmbigs > 0){      numAmbigSeqs++; }
431                         
432 //                      int startPos = currentSeq.getStartPos();
433 //                      if(startPos > maxStartPos)      {       maxStartPos = startPos; }
434 //
435 //                      int endPos = currentSeq.getEndPos();
436 //                      if(endPos < minEndPos)          {       minEndPos = endPos;             }
437                         referenceSeqs.push_back(currentSeq);
438                                 
439                         m->gobble(referenceFile);
440                 }
441                 referenceFile.close();
442                 numRefs = referenceSeqs.size();
443
444                 
445                 for(int i=0;i<numRefs;i++){
446                         referenceSeqs[i].padToPos(maxStartPos);
447                         referenceSeqs[i].padFromPos(minEndPos);
448                 }
449                 
450                 if(numAmbigSeqs != 0){
451                         m->mothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n");
452                 }               
453                 
454         }
455         catch(exception& e) {
456                 m->errorOut(e, "SeqErrorCommand", "getReferences");
457                 exit(1);
458         }
459 }
460
461 //***************************************************************************************************************
462
463 Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
464         try {
465                 if(query.getAlignLength() != reference.getAlignLength()){
466                         m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
467                 }
468                 int alignLength = query.getAlignLength();
469         
470                 string q = query.getAligned();
471                 string r = reference.getAligned();
472
473                 int started = 0;
474                 Compare errors;
475
476                 for(int i=0;i<alignLength;i++){
477                         if(r[i] != 'N' && q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                  //      no missing data and no double gaps
478                                 started = 1;
479                                 
480                                 if(q[i] == 'A'){
481                                         if(r[i] == 'A'){        errors.AA++;    errors.matches++;       errors.sequence += 'm'; }
482                                         if(r[i] == 'T'){        errors.AT++;    errors.sequence += 's'; }
483                                         if(r[i] == 'G'){        errors.AG++;    errors.sequence += 's'; }
484                                         if(r[i] == 'C'){        errors.AC++;    errors.sequence += 's'; }
485                                         if(r[i] == '-'){        errors.Ai++;    errors.sequence += 'i'; }
486                                 }
487                                 else if(q[i] == 'T'){
488                                         if(r[i] == 'A'){        errors.TA++;    errors.sequence += 's'; }
489                                         if(r[i] == 'T'){        errors.TT++;    errors.matches++;       errors.sequence += 'm'; }
490                                         if(r[i] == 'G'){        errors.TG++;    errors.sequence += 's'; }
491                                         if(r[i] == 'C'){        errors.TC++;    errors.sequence += 's'; }
492                                         if(r[i] == '-'){        errors.Ti++;    errors.sequence += 'i'; }
493                                 }
494                                 else if(q[i] == 'G'){
495                                         if(r[i] == 'A'){        errors.GA++;    errors.sequence += 's'; }
496                                         if(r[i] == 'T'){        errors.GT++;    errors.sequence += 's'; }
497                                         if(r[i] == 'G'){        errors.GG++;    errors.matches++;       errors.sequence += 'm'; }
498                                         if(r[i] == 'C'){        errors.GC++;    errors.sequence += 's'; }
499                                         if(r[i] == '-'){        errors.Gi++;    errors.sequence += 'i'; }
500                                 }
501                                 else if(q[i] == 'C'){
502                                         if(r[i] == 'A'){        errors.CA++;    errors.sequence += 's'; }
503                                         if(r[i] == 'T'){        errors.CT++;    errors.sequence += 's'; }
504                                         if(r[i] == 'G'){        errors.CG++;    errors.sequence += 's'; }
505                                         if(r[i] == 'C'){        errors.CC++;    errors.matches++;       errors.sequence += 'm'; }
506                                         if(r[i] == '-'){        errors.Ci++;    errors.sequence += 'i'; }
507                                 }
508                                 else if(q[i] == 'N'){
509                                         if(r[i] == 'A'){        errors.NA++;    errors.sequence += 'a'; }
510                                         if(r[i] == 'T'){        errors.NT++;    errors.sequence += 'a'; }
511                                         if(r[i] == 'G'){        errors.NG++;    errors.sequence += 'a'; }
512                                         if(r[i] == 'C'){        errors.NC++;    errors.sequence += 'a'; }
513                                         if(r[i] == '-'){        errors.Ni++;    errors.sequence += 'a'; }
514                                 }
515                                 else if(q[i] == '-' && r[i] != '-'){
516                                         if(r[i] == 'A'){        errors.dA++;    errors.sequence += 'd'; }
517                                         if(r[i] == 'T'){        errors.dT++;    errors.sequence += 'd'; }
518                                         if(r[i] == 'G'){        errors.dG++;    errors.sequence += 'd'; }
519                                         if(r[i] == 'C'){        errors.dC++;    errors.sequence += 'd'; }
520                                 }
521                                 errors.total++; 
522                                 
523                         }
524                         else if(q[i] == '.' && r[i] != '.'){            //      reference extends beyond query
525                                 if(started == 1){       break;  }
526                         }
527                         else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
528                                 if(started == 1){       break;  }
529                         }
530                         else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
531                                 if(started == 1){       break;  }                       
532                         }
533                         
534                 }
535                 errors.mismatches = errors.total-errors.matches;
536                 errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
537                 errors.queryName = query.getName();
538                 errors.refName = reference.getName();
539                 
540                 return errors;
541         }
542         catch(exception& e) {
543                 m->errorOut(e, "SeqErrorCommand", "getErrors");
544                 exit(1);
545         }
546 }
547
548 //***************************************************************************************************************
549
550 map<string, int> SeqErrorCommand::getWeights(){
551         ifstream nameFile;
552         m->openInputFile(namesFileName, nameFile);
553         
554         string seqName;
555         string redundantSeqs;
556         map<string, int> nameCountMap;
557         
558         while(nameFile){
559                 nameFile >> seqName >> redundantSeqs;
560                 nameCountMap[seqName] = m->getNumNames(redundantSeqs); 
561                 m->gobble(nameFile);
562         }
563         return nameCountMap;
564 }
565
566
567 //***************************************************************************************************************
568
569 void SeqErrorCommand::printErrorHeader(){
570         try {
571                 errorSummaryFile << "query\treference\tweight\t";
572                 errorSummaryFile << "AA\tAT\tAG\tAC\tTA\tTT\tTG\tTC\tGA\tGT\tGG\tGC\tCA\tCT\tCG\tCC\tNA\tNT\tNG\tNC\tAi\tTi\tGi\tCi\tNi\tdA\tdT\tdG\tdC\t";
573                 errorSummaryFile << "insertions\tdeletions\tsubstitutions\tambig\tmatches\tmismatches\ttotal\terror\tnumparents\n";
574                 
575                 errorSummaryFile << setprecision(6);
576                 errorSummaryFile.setf(ios::fixed);
577         }
578         catch(exception& e) {
579                 m->errorOut(e, "SeqErrorCommand", "printErrorHeader");
580                 exit(1);
581         }
582 }
583
584 //***************************************************************************************************************
585
586 void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs){
587         try {
588
589                 errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
590                 errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
591                 errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
592                 errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
593                 errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
594                 errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
595                 errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t';
596                 errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
597                 
598                 errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                  //insertions
599                 errorSummaryFile << error.dA + error.dT + error.dG + error.dC << '\t';                  //deletions
600                 errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';      //substitutions
601                 errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';       //ambiguities
602                 errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << '\t' << numParentSeqs << endl;
603
604                 errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
605                 
606                 int a=0;                int t=1;                int g=2;                int c=3;
607                 int gap=4;              int n=5;
608
609                 if(numParentSeqs == 1 || ignoreChimeras == 0){
610                         substitutionMatrix[a][a] += error.weight * error.AA;
611                         substitutionMatrix[a][t] += error.weight * error.TA;
612                         substitutionMatrix[a][g] += error.weight * error.GA;
613                         substitutionMatrix[a][c] += error.weight * error.CA;
614                         substitutionMatrix[a][gap] += error.weight * error.dA;
615                         substitutionMatrix[a][n] += error.weight * error.NA;
616                         
617                         substitutionMatrix[t][a] += error.weight * error.AT;
618                         substitutionMatrix[t][t] += error.weight * error.TT;
619                         substitutionMatrix[t][g] += error.weight * error.GT;
620                         substitutionMatrix[t][c] += error.weight * error.CT;
621                         substitutionMatrix[t][gap] += error.weight * error.dT;
622                         substitutionMatrix[t][n] += error.weight * error.NT;
623
624                         substitutionMatrix[g][a] += error.weight * error.AG;
625                         substitutionMatrix[g][t] += error.weight * error.TG;
626                         substitutionMatrix[g][g] += error.weight * error.GG;
627                         substitutionMatrix[g][c] += error.weight * error.CG;
628                         substitutionMatrix[g][gap] += error.weight * error.dG;
629                         substitutionMatrix[g][n] += error.weight * error.NG;
630
631                         substitutionMatrix[c][a] += error.weight * error.AC;
632                         substitutionMatrix[c][t] += error.weight * error.TC;
633                         substitutionMatrix[c][g] += error.weight * error.GC;
634                         substitutionMatrix[c][c] += error.weight * error.CC;
635                         substitutionMatrix[c][gap] += error.weight * error.dC;
636                         substitutionMatrix[c][n] += error.weight * error.NC;
637
638                         substitutionMatrix[gap][a] += error.weight * error.Ai;
639                         substitutionMatrix[gap][t] += error.weight * error.Ti;
640                         substitutionMatrix[gap][g] += error.weight * error.Gi;
641                         substitutionMatrix[gap][c] += error.weight * error.Ci;
642                         substitutionMatrix[gap][n] += error.weight * error.Ni;
643                 }
644         }
645         catch(exception& e) {
646                 m->errorOut(e, "SeqErrorCommand", "printErrorData");
647                 exit(1);
648         }
649 }
650
651 //***************************************************************************************************************
652
653 void SeqErrorCommand::printSubMatrix(){
654         try {
655                 string subMatrixFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.matrix";
656                 ofstream subMatrixFile;
657                 m->openOutputFile(subMatrixFileName, subMatrixFile);
658                 outputNames.push_back(subMatrixFileName);  outputTypes["error.matrix"].push_back(subMatrixFileName);
659                 vector<string> bases(6);
660                 bases[0] = "A";
661                 bases[1] = "T";
662                 bases[2] = "G";
663                 bases[3] = "C";
664                 bases[4] = "Gap";
665                 bases[5] = "N";
666                 vector<int> refSums(5,1);
667
668                 for(int i=0;i<5;i++){
669                         subMatrixFile << "\tr" << bases[i];
670                         
671                         for(int j=0;j<6;j++){
672                                 refSums[i] += substitutionMatrix[i][j];                         
673                         }
674                 }
675                 subMatrixFile << endl;
676                 
677                 for(int i=0;i<6;i++){
678                         subMatrixFile << 'q' << bases[i];
679                         for(int j=0;j<5;j++){
680                                 subMatrixFile << '\t' << substitutionMatrix[j][i];                              
681                         }
682                         subMatrixFile << endl;
683                 }
684
685                 subMatrixFile << "total";
686                 for(int i=0;i<5;i++){
687                         subMatrixFile << '\t' << refSums[i];
688                 }
689                 subMatrixFile << endl;
690                 subMatrixFile.close();
691         }
692         catch(exception& e) {
693                 m->errorOut(e, "SeqErrorCommand", "printSubMatrix");
694                 exit(1);
695         }
696 }
697 //***************************************************************************************************************
698
699 void SeqErrorCommand::printErrorFRFile(map<char, vector<int> > errorForward, map<char, vector<int> > errorReverse){
700         try{
701                 string errorForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.forward";
702                 ofstream errorForwardFile;
703                 m->openOutputFile(errorForwardFileName, errorForwardFile);
704                 outputNames.push_back(errorForwardFileName);  outputTypes["error.forward"].push_back(errorForwardFileName);
705
706                 errorForwardFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
707                 for(int i=0;i<maxLength;i++){
708                         float match = (float)errorForward['m'][i];
709                         float subst = (float)errorForward['s'][i];
710                         float insert = (float)errorForward['i'][i];
711                         float del = (float)errorForward['d'][i];
712                         float amb = (float)errorForward['a'][i];
713                         float total = match + subst + insert + del + amb;
714                         if(total == 0){ break;  }
715                         errorForwardFile << i+1 << '\t' << total << '\t' << match/total  << '\t' << subst/total  << '\t' << insert/total  << '\t' << del/total  << '\t' << amb/total << endl;
716                 }
717                 errorForwardFile.close();
718
719                 string errorReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.reverse";
720                 ofstream errorReverseFile;
721                 m->openOutputFile(errorReverseFileName, errorReverseFile);
722                 outputNames.push_back(errorReverseFileName);  outputTypes["error.reverse"].push_back(errorReverseFileName);
723
724                 errorReverseFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
725                 for(int i=0;i<maxLength;i++){
726                         float match = (float)errorReverse['m'][i];
727                         float subst = (float)errorReverse['s'][i];
728                         float insert = (float)errorReverse['i'][i];
729                         float del = (float)errorReverse['d'][i];
730                         float amb = (float)errorReverse['a'][i];
731                         float total = match + subst + insert + del + amb;
732                         if(total == 0){ break;  }
733                         errorReverseFile << i+1 << '\t' << total << '\t' << match/total  << '\t' << subst/total  << '\t' << insert/total  << '\t' << del/total  << '\t' << amb/total << endl;
734                 }
735                 errorReverseFile.close();
736         }
737         catch(exception& e) {
738                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
739                 exit(1);
740         }
741 }
742
743 //***************************************************************************************************************
744
745 void SeqErrorCommand::printErrorQuality(map<char, vector<int> > qScoreErrorMap){
746         try{
747
748                 string errorQualityFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.quality";
749                 ofstream errorQualityFile;
750                 m->openOutputFile(errorQualityFileName, errorQualityFile);
751                 outputNames.push_back(errorQualityFileName);  outputTypes["error.quality"].push_back(errorQualityFileName);
752
753                 errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl;
754                 for(int i=0;i<41;i++){
755                         errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl;
756                 }
757                 errorQualityFile.close();
758         }
759         catch(exception& e) {
760                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
761                 exit(1);
762         }
763 }
764
765
766 //***************************************************************************************************************
767
768 void SeqErrorCommand::printQualityFR(vector<vector<int> > qualForwardMap, vector<vector<int> > qualReverseMap){
769
770         try{
771                 int numRows = 0;
772                 int numColumns = qualForwardMap[0].size();
773
774                 for(int i=0;i<qualForwardMap.size();i++){
775                         for(int j=0;j<numColumns;j++){
776                                 if(qualForwardMap[i][j] != 0){
777                                         if(numRows < i)         {       numRows = i+20;         }
778                                 }
779                         }
780                 }
781
782                 string qualityForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.forward";
783                 ofstream qualityForwardFile;
784                 m->openOutputFile(qualityForwardFileName, qualityForwardFile);
785                 outputNames.push_back(qualityForwardFileName);  outputTypes["error.qual.forward"].push_back(qualityForwardFileName);
786
787                 for(int i=0;i<numColumns;i++){  qualityForwardFile << '\t' << i;        }       qualityForwardFile << endl;
788
789                 for(int i=0;i<numRows;i++){
790                         qualityForwardFile << i+1;
791                         for(int j=0;j<numColumns;j++){
792                                 qualityForwardFile << '\t' << qualForwardMap[i][j];
793                         }
794
795                         qualityForwardFile << endl;
796                 }
797                 qualityForwardFile.close();
798
799                 
800                 string qualityReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.reverse";
801                 ofstream qualityReverseFile;
802                 m->openOutputFile(qualityReverseFileName, qualityReverseFile);
803                 outputNames.push_back(qualityReverseFileName);  outputTypes["error.qual.reverse"].push_back(qualityReverseFileName);
804                 
805                 for(int i=0;i<numColumns;i++){  qualityReverseFile << '\t' << i;        }       qualityReverseFile << endl;
806                 for(int i=0;i<numRows;i++){
807                         
808                         qualityReverseFile << i+1;
809                         for(int j=0;j<numColumns;j++){
810                                 qualityReverseFile << '\t' << qualReverseMap[i][j];
811                         }
812                         qualityReverseFile << endl;
813                 }
814                 qualityReverseFile.close();
815         }
816         catch(exception& e) {
817                 m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
818                 exit(1);
819         }
820         
821 }
822
823 //***************************************************************************************************************