git.donarmstrong.com Git - mothur.git/blobdiff - seqerrorcommand.cpp
fixed bug in seqerrorcommand
[mothur.git] / seqerrorcommand.cpp
index d2c0fcb64f92652a14dee26e40e89ae5dc9a19dd..18b0b72f9ae536e01c889315d2d753b421ee6bb0 100644 (file)
@@ -27,8 +27,7 @@ vector<string> SeqErrorCommand::getValidParameters(){
 //**********************************************************************************************************************
 SeqErrorCommand::SeqErrorCommand(){    
        try {
-               abort = true;
-               //initialize outputTypes
+               abort = true; calledHelp = true; 
                vector<string> tempOutNames;
                outputTypes["error.summary"] = tempOutNames;
                outputTypes["error.seq"] = tempOutNames;
@@ -73,10 +72,10 @@ vector<string> SeqErrorCommand::getRequiredFiles(){
 SeqErrorCommand::SeqErrorCommand(string option)  {
        try {
                
-               abort = false;
+               abort = false; calledHelp = false;   
                
                //allow user to run help
-               if(option == "help") { help(); abort = true; }
+               if(option == "help") { help(); abort = true; calledHelp = true; }
                
                else {
                        string temp;
@@ -197,7 +196,7 @@ SeqErrorCommand::SeqErrorCommand(string option)  {
                        convert(temp, ignoreChimeras);  
 
                        substitutionMatrix.resize(6);
-                       for(int i=0;i<6;i++){   substitutionMatrix[i].assign(6,0);      }
+                       for(int i=0;i<6;i++){   substitutionMatrix[i].resize(6,0);      }
                }
        }
        catch(exception& e) {
@@ -223,16 +222,16 @@ void SeqErrorCommand::help(){
 
 //***************************************************************************************************************
 
-SeqErrorCommand::~SeqErrorCommand(){
-
-}
+SeqErrorCommand::~SeqErrorCommand(){   /*      void    */      }
 
 //***************************************************************************************************************
 
 int SeqErrorCommand::execute(){
        try{
-               if (abort == true) { return 0; }
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 
+               maxLength = 2000;
+               
                string errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary";
                m->openOutputFile(errorSummaryFileName, errorSummaryFile);
                outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName);
@@ -262,11 +261,11 @@ int SeqErrorCommand::execute(){
                        m->openInputFile(qualFileName, qualFile);
                        report = ReportFile(reportFile, reportFileName);
                        
-                       qualForwardMap.resize(1000);
-                       qualReverseMap.resize(1000);
-                       for(int i=0;i<1000;i++){
-                               qualForwardMap[i].assign(100,0);
-                               qualReverseMap[i].assign(100,0);
+                       qualForwardMap.resize(maxLength);
+                       qualReverseMap.resize(maxLength);
+                       for(int i=0;i<maxLength;i++){
+                               qualForwardMap[i].assign(maxLength,0);
+                               qualReverseMap[i].assign(maxLength,0);
                        }                               
                }
                
@@ -285,66 +284,69 @@ int SeqErrorCommand::execute(){
                qScoreErrorMap['a'].assign(41, 0);
                
                map<char, vector<int> > errorForward;
-               errorForward['m'].assign(1000,0);
-               errorForward['s'].assign(1000,0);
-               errorForward['i'].assign(1000,0);
-               errorForward['d'].assign(1000,0);
-               errorForward['a'].assign(1000,0);
+               errorForward['m'].assign(maxLength,0);
+               errorForward['s'].assign(maxLength,0);
+               errorForward['i'].assign(maxLength,0);
+               errorForward['d'].assign(maxLength,0);
+               errorForward['a'].assign(maxLength,0);
                
                map<char, vector<int> > errorReverse;
-               errorReverse['m'].assign(1000,0);
-               errorReverse['s'].assign(1000,0);
-               errorReverse['i'].assign(1000,0);
-               errorReverse['d'].assign(1000,0);
-               errorReverse['a'].assign(1000,0);       
+               errorReverse['m'].assign(maxLength,0);
+               errorReverse['s'].assign(maxLength,0);
+               errorReverse['i'].assign(maxLength,0);
+               errorReverse['d'].assign(maxLength,0);
+               errorReverse['a'].assign(maxLength,0);  
                
 
                string errorChimeraFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.chimera";
                RefChimeraTest chimeraTest(referenceSeqs, errorChimeraFileName);
                outputNames.push_back(errorChimeraFileName); outputTypes["error.chimera"].push_back(errorChimeraFileName);
                
+               vector<string> megaAlignVector(numRefs, "");
+
                int index = 0;
                bool ignoreSeq = 0;
                
                while(queryFile){
-                       
+
                        if (m->control_pressed) { errorSummaryFile.close();     errorSeqFile.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
                
                        Sequence query(queryFile);
-                                               
+                       
                        int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned());
                        int closestRefIndex = chimeraTest.getClosestRefIndex();
 
                        if(numParentSeqs > 1 && ignoreChimeras == 1)    {       ignoreSeq = 1;  }
                        else                                                                                    {       ignoreSeq = 0;  }
 
-
                        Compare minCompare = getErrors(query, referenceSeqs[closestRefIndex]);
                        
                        if(namesFileName != ""){
                                it = weights.find(query.getName());
                                minCompare.weight = it->second;
                        }
-                       else    {       minCompare.weight = 1;  }
+                       else{   minCompare.weight = 1;  }
 
                        printErrorData(minCompare, numParentSeqs);
 
                        if(!ignoreSeq){
+                               
                                for(int i=0;i<minCompare.total;i++){
                                        char letter = minCompare.sequence[i];
+
                                        errorForward[letter][i] += minCompare.weight;
                                        errorReverse[letter][minCompare.total-i-1] += minCompare.weight;                                
                                }
                        }
-                       
+
                        if(qualFileName != "" && reportFileName != ""){
                                report = ReportFile(reportFile);
                                
-                               int origLength = report.getQueryLength();
+//                             int origLength = report.getQueryLength();
                                int startBase = report.getQueryStart();
                                int endBase = report.getQueryEnd();
 
-                               quality = QualityScores(qualFile, origLength);
+                               quality = QualityScores(qualFile);
 
                                if(!ignoreSeq){
                                        quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
@@ -352,7 +354,7 @@ int SeqErrorCommand::execute(){
                                        quality.updateReverseMap(qualReverseMap, startBase, endBase, minCompare.weight);
                                }
                        }                       
-                       
+
                        if(minCompare.errorRate < threshold && !ignoreSeq){
                                totalBases += (minCompare.total * minCompare.weight);
                                totalMatches += minCompare.matches * minCompare.weight;
@@ -362,10 +364,13 @@ int SeqErrorCommand::execute(){
                                }                               
                                misMatchCounts[minCompare.mismatches] += minCompare.weight;
                                numSeqs++;
+                               
+                               megaAlignVector[closestRefIndex] += query.getInlineSeq() + '\n';
                        }
-                       
+
                        index++;
-                       if(index % 1000 == 0){  cout << index << endl;  }
+                       
+                       if(index % 1000 == 0){  m->mothurOut(toString(index) + '\n');   }
                }
                queryFile.close();
                errorSummaryFile.close();       
@@ -397,6 +402,17 @@ int SeqErrorCommand::execute(){
 
                printSubMatrix();
                                
+               string megAlignmentFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.ref-query";
+               ofstream megAlignmentFile;
+               m->openOutputFile(megAlignmentFileName, megAlignmentFile);
+               outputNames.push_back(megAlignmentFileName);  outputTypes["error.ref-query"].push_back(megAlignmentFileName);
+               
+               for(int i=0;i<numRefs;i++){
+                       megAlignmentFile << referenceSeqs[i].getInlineSeq() << endl;
+                       megAlignmentFile << megaAlignVector[i] << endl;
+               }
+               
+               
                m->mothurOutEndLine();
                m->mothurOut("Output File Names: "); m->mothurOutEndLine();
                for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
@@ -418,20 +434,37 @@ void SeqErrorCommand::getReferences(){
                ifstream referenceFile;
                m->openInputFile(referenceFileName, referenceFile);
                
+               int numAmbigSeqs = 0;
+               
+               int maxStartPos = 0;
+               int minEndPos = 100000;
+               
                while(referenceFile){
                        Sequence currentSeq(referenceFile);
                        int numAmbigs = currentSeq.getAmbigBases();
+                       if(numAmbigs > 0){      numAmbigSeqs++; }
                        
-                       if(numAmbigs != 0){
-                               m->mothurOut("Warning: " + toString(currentSeq.getName()) + " has " + toString(numAmbigs) + " ambiguous bases, these bases will be removed\n");
-                               currentSeq.removeAmbigBases();
-                       }
+                       int startPos = currentSeq.getStartPos();
+                       if(startPos > maxStartPos)      {       maxStartPos = startPos; }
+
+                       int endPos = currentSeq.getEndPos();
+                       if(endPos < minEndPos)          {       minEndPos = endPos;             }
                        referenceSeqs.push_back(currentSeq);
                        m->gobble(referenceFile);
                }
+               referenceFile.close();
                numRefs = referenceSeqs.size();
+
+               
+               for(int i=0;i<numRefs;i++){
+                       referenceSeqs[i].padToPos(maxStartPos);
+                       referenceSeqs[i].padFromPos(minEndPos);
+               }
+               
+               if(numAmbigSeqs != 0){
+                       m->mothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n");
+               }               
                
-               referenceFile.close();
        }
        catch(exception& e) {
                m->errorOut(e, "SeqErrorCommand", "getReferences");
@@ -455,7 +488,7 @@ Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
                Compare errors;
 
                for(int i=0;i<alignLength;i++){
-                       if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                 //      no missing data and no double gaps
+                       if(r[i] != 'N' && q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){                  //      no missing data and no double gaps
                                started = 1;
                                
                                if(q[i] == 'A'){
@@ -506,7 +539,6 @@ Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
                                if(started == 1){       break;  }
                        }
                        else if(q[i] != '.' && r[i] == '.'){            //      query extends beyond reference
-                               m->mothurOut("Warning: " + toString(query.getName()) + " extend beyond " + toString(reference.getName()) + ".  Ignoring the extra bases in the query\n");
                                if(started == 1){       break;  }
                        }
                        else if(q[i] == '.' && r[i] == '.'){            //      both are missing data
@@ -567,13 +599,14 @@ void SeqErrorCommand::printErrorHeader(){
 
 void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs){
        try {
+
                errorSummaryFile << error.queryName << '\t' << error.refName << '\t' << error.weight << '\t';
                errorSummaryFile << error.AA << '\t' << error.AT << '\t' << error.AG << '\t' << error.AC << '\t';
                errorSummaryFile << error.TA << '\t' << error.TT << '\t' << error.TG << '\t' << error.TC << '\t';
                errorSummaryFile << error.GA << '\t' << error.GT << '\t' << error.GG << '\t' << error.GC << '\t';
                errorSummaryFile << error.CA << '\t' << error.CT << '\t' << error.CG << '\t' << error.CC << '\t';
                errorSummaryFile << error.NA << '\t' << error.NT << '\t' << error.NG << '\t' << error.NC << '\t';
-               errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t' ;
+               errorSummaryFile << error.Ai << '\t' << error.Ti << '\t' << error.Gi << '\t' << error.Ci << '\t' << error.Ni << '\t';
                errorSummaryFile << error.dA << '\t' << error.dT << '\t' << error.dG << '\t' << error.dC << '\t';
                
                errorSummaryFile << error.Ai + error.Ti + error.Gi + error.Ci << '\t';                  //insertions
@@ -581,12 +614,12 @@ void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs){
                errorSummaryFile << error.mismatches - (error.Ai + error.Ti + error.Gi + error.Ci) - (error.dA + error.dT + error.dG + error.dC) - (error.NA + error.NT + error.NG + error.NC + error.Ni) << '\t';      //substitutions
                errorSummaryFile << error.NA + error.NT + error.NG + error.NC + error.Ni << '\t';       //ambiguities
                errorSummaryFile << error.matches << '\t' << error.mismatches << '\t' << error.total << '\t' << error.errorRate << '\t' << numParentSeqs << endl;
-               
+
                errorSeqFile << '>' << error.queryName << "\tref:" << error.refName << '\n' << error.sequence << endl;
                
-               
                int a=0;                int t=1;                int g=2;                int c=3;
                int gap=4;              int n=5;
+
                if(numParentSeqs == 1 || ignoreChimeras == 0){
                        substitutionMatrix[a][a] += error.weight * error.AA;
                        substitutionMatrix[a][t] += error.weight * error.TA;
@@ -594,7 +627,7 @@ void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs){
                        substitutionMatrix[a][c] += error.weight * error.CA;
                        substitutionMatrix[a][gap] += error.weight * error.dA;
                        substitutionMatrix[a][n] += error.weight * error.NA;
-
+                       
                        substitutionMatrix[t][a] += error.weight * error.AT;
                        substitutionMatrix[t][t] += error.weight * error.TT;
                        substitutionMatrix[t][g] += error.weight * error.GT;
@@ -685,7 +718,7 @@ void SeqErrorCommand::printErrorFRFile(map<char, vector<int> > errorForward, map
                outputNames.push_back(errorForwardFileName);  outputTypes["error.forward"].push_back(errorForwardFileName);
 
                errorForwardFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
-               for(int i=0;i<1000;i++){
+               for(int i=0;i<maxLength;i++){
                        float match = (float)errorForward['m'][i];
                        float subst = (float)errorForward['s'][i];
                        float insert = (float)errorForward['i'][i];
@@ -703,7 +736,7 @@ void SeqErrorCommand::printErrorFRFile(map<char, vector<int> > errorForward, map
                outputNames.push_back(errorReverseFileName);  outputTypes["error.reverse"].push_back(errorReverseFileName);
 
                errorReverseFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl;
-               for(int i=0;i<1000;i++){
+               for(int i=0;i<maxLength;i++){
                        float match = (float)errorReverse['m'][i];
                        float subst = (float)errorReverse['s'][i];
                        float insert = (float)errorReverse['i'][i];