]> git.donarmstrong.com Git - mothur.git/blobdiff - makecontigscommand.cpp
major change to the tree class to use the count table class instead of tree map....
[mothur.git] / makecontigscommand.cpp
index 560b39cdd6adfaec6fab5bafc8e30175038c0b9c..691d706ab00efa8823412753b72f08e5198873b1 100644 (file)
@@ -18,6 +18,7 @@ vector<string> MakeContigsCommand::setParameters(){
                CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
                CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
                CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend);
+        CommandParameter pthreshold("threshold", "Number", "", "40", "", "", "",false,false); parameters.push_back(pthreshold);
                CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -37,12 +38,13 @@ string MakeContigsCommand::getHelpString(){
                string helpString = "";
                helpString += "The make.contigs command reads a forward fastq file and a reverse fastq file and outputs new fasta and quality files.\n";
                helpString += "The make.contigs command parameters are ffastq, rfastq, align, match, mismatch, gapopen, gapextend and processors.\n";
-               helpString += "The ffastq and rfastq parameter is required.\n";
-               helpString += "The align parameter allows you to specify the alignment method to use.  Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n";
+               helpString += "The ffastq and rfastq parameters are required.\n";
+               helpString += "The align parameter allows you to specify the alignment method to use.  Your options are: gotoh and needleman. The default is needleman.\n";
                helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
                helpString += "The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n";
                helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
                helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
+        helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=40.\n";
         helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
         helpString += "The make.contigs command should be in the following format: \n";
                helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n";
@@ -54,7 +56,28 @@ string MakeContigsCommand::getHelpString(){
                exit(1);
        }
 }
-
+//**********************************************************************************************************************
+string MakeContigsCommand::getOutputFileNameTag(string type, string inputName=""){     
+       try {
+        string outputFileName = "";
+               map<string, vector<string> >::iterator it;
+        
+        //is this a type this command creates
+        it = outputTypes.find(type);
+        if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
+        else {
+            if (type == "fasta")             {   outputFileName =  "contigs.fasta";         }
+            else if (type == "qfile")        {   outputFileName =  "contigs.qual";          }
+            else if (type == "mismatch")     {   outputFileName =  "contigs.mismatch";      }
+            else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
+        }
+        return outputFileName;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeContigsCommand", "getOutputFileNameTag");
+               exit(1);
+       }
+}
 //**********************************************************************************************************************
 MakeContigsCommand::MakeContigsCommand(){      
        try {
@@ -152,6 +175,10 @@ MakeContigsCommand::MakeContigsCommand(string option)  {
                        m->mothurConvert(temp, gapExtend); 
             if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
                        
+            temp = validParameter.validFile(parameters, "threshold", false);   if (temp == "not found"){       temp = "40";                    }
+                       m->mothurConvert(temp, threshold); 
+            if ((threshold < 0) || (threshold > 40)) { m->mothurOut("[ERROR]: threshold must be between 0 and 40.\n"); abort=true; }
+
                        temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
                        m->setProcessors(temp);
                        m->mothurConvert(temp, processors);
@@ -183,9 +210,9 @@ int MakeContigsCommand::execute(){
     
         if (m->control_pressed) { return 0; }
         
-        string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.fasta";
-        string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.qual";
-        string outMisMatchFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.mismatches";
+        string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("fasta");
+        string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("qfile");
+        string outMisMatchFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("mismatch");
         outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
         outputNames.push_back(outQualFile); outputTypes["qfile"].push_back(outQualFile);
         outputNames.push_back(outMisMatchFile); outputTypes["mismatch"].push_back(outMisMatchFile);
@@ -291,7 +318,7 @@ int MakeContigsCommand::createProcesses(vector< vector<string> > files, string o
                for( int i=0; i<processors-1; i++ ){
                        string extension = toString(i) + ".temp";
                        
-                       contigsData* tempcontig = new contigsData(files[i], (outputFasta + extension), (outputQual + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, i);
+                       contigsData* tempcontig = new contigsData(files[i], (outputFasta + extension), (outputQual + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, threshold, i);
                        pDataArray.push_back(tempcontig);
                        processIDS.push_back(i);
             
@@ -386,25 +413,52 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
             int numMismatches = 0;
             string seq1 = fSeq.getAligned();
             string seq2 = rSeq.getAligned();
-           
             vector<int> scores1 = fQual.getQualityScores();
             vector<int> scores2 = rQual.getQualityScores();
-            
-            for (int i = 0; i < length; i++) {
-                if (seq1[i] == seq2[i]) { //match, add base and choose highest score
+
+           // if (num < 5) {  cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; }
+            int overlapStart = fSeq.getStartPos();
+            int seq2Start = rSeq.getStartPos();
+            //bigger of the 2 starting positions is the location of the overlapping start
+            if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
+                overlapStart = seq2Start; 
+                for (int i = 0; i < overlapStart; i++) {
                     contig += seq1[i];
                     contigScores.push_back(scores1[ABaseMap[i]]);
-                    if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; }
-                }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2
+                }
+            }else { //seq1 starts later so take from 0 to overlapStart from seq2
+                for (int i = 0; i < overlapStart; i++) {
                     contig += seq2[i];
                     contigScores.push_back(scores2[BBaseMap[i]]);
-                }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1
+                }
+            }
+            
+            int seq1End = fSeq.getEndPos();
+            int seq2End = rSeq.getEndPos();
+            int overlapEnd = seq1End;
+            if (seq2End < overlapEnd) { overlapEnd = seq2End; }  //smallest end position is where overlapping ends
+            
+            for (int i = overlapStart; i < overlapEnd; i++) {
+                if (seq1[i] == seq2[i]) { //match, add base and choose highest score
                     contig += seq1[i];
                     contigScores.push_back(scores1[ABaseMap[i]]);
+                    if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
+                }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
+                    if (scores2[BBaseMap[i]] < threshold) { } //
+                    else {
+                        contig += seq2[i];
+                        contigScores.push_back(scores2[BBaseMap[i]]);
+                    }
+                }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
+                    if (scores1[ABaseMap[i]] < threshold) { } //
+                    else {
+                        contig += seq1[i];
+                        contigScores.push_back(scores1[ABaseMap[i]]);
+                    }
                 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
                     char c = seq1[i];
                     contigScores.push_back(scores1[ABaseMap[i]]);
-                    if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; c = seq2[i]; }
+                    if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
                     contig += c;
                     numMismatches++;
                 }else { //should never get here
@@ -412,6 +466,19 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
                 }
             }
             
+            if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
+                for (int i = overlapEnd; i < length; i++) {
+                    contig += seq2[i];
+                    contigScores.push_back(scores2[BBaseMap[i]]);
+                }
+            }else { //seq2 ends before seq1 so take from overlap to length from seq1
+                for (int i = overlapEnd; i < length; i++) {
+                    contig += seq1[i];
+                    contigScores.push_back(scores1[ABaseMap[i]]);
+                }
+
+            }
+            //if (num < 5) { cout << overlapStart << '\t' << overlapEnd << endl << seq1 << endl << seq2 << endl<< contig << endl; }
             //output
             outFasta << ">" << fSeq.getName() << endl << contig << endl;
             outQual << ">" << fSeq.getName() << endl;