X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=makecontigscommand.cpp;h=691d706ab00efa8823412753b72f08e5198873b1;hb=6c2b1e530a5c0bb87040e58a3e410097acdfcc3d;hp=560b39cdd6adfaec6fab5bafc8e30175038c0b9c;hpb=ccf2fedbb600a362777a11501bb56c9c7746068e;p=mothur.git diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index 560b39c..691d706 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -18,6 +18,7 @@ vector MakeContigsCommand::setParameters(){ CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen); CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend); + CommandParameter pthreshold("threshold", "Number", "", "40", "", "", "",false,false); parameters.push_back(pthreshold); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -37,12 +38,13 @@ string MakeContigsCommand::getHelpString(){ string helpString = ""; helpString += "The make.contigs command reads a forward fastq file and a reverse fastq file and outputs new fasta and quality files.\n"; helpString += "The make.contigs command parameters are ffastq, rfastq, align, match, mismatch, gapopen, gapextend and processors.\n"; - helpString += "The ffastq and rfastq parameter is required.\n"; - helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n"; + helpString += "The ffastq and rfastq parameters are required.\n"; + helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n"; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"; helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"; helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"; + helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=40.\n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; helpString += "The make.contigs command should be in the following format: \n"; helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n"; @@ -54,7 +56,28 @@ string MakeContigsCommand::getHelpString(){ exit(1); } } - +//********************************************************************************************************************** +string MakeContigsCommand::getOutputFileNameTag(string type, string inputName=""){ + try { + string outputFileName = ""; + map >::iterator it; + + //is this a type this command creates + it = outputTypes.find(type); + if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } + else { + if (type == "fasta") { outputFileName = "contigs.fasta"; } + else if (type == "qfile") { outputFileName = "contigs.qual"; } + else if (type == "mismatch") { outputFileName = "contigs.mismatch"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } + } + return outputFileName; + } + catch(exception& e) { + m->errorOut(e, "MakeContigsCommand", "getOutputFileNameTag"); + exit(1); + } +} //********************************************************************************************************************** MakeContigsCommand::MakeContigsCommand(){ try { @@ -152,6 +175,10 @@ MakeContigsCommand::MakeContigsCommand(string option) { m->mothurConvert(temp, gapExtend); if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; } + temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "40"; } + m->mothurConvert(temp, threshold); + if ((threshold < 0) || (threshold > 40)) { m->mothurOut("[ERROR]: threshold must be between 0 and 40.\n"); abort=true; } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); @@ -183,9 +210,9 @@ int MakeContigsCommand::execute(){ if (m->control_pressed) { return 0; } - string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.fasta"; - string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.qual"; - string outMisMatchFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.mismatches"; + string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("fasta"); + string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("qfile"); + string outMisMatchFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("mismatch"); outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); outputNames.push_back(outQualFile); outputTypes["qfile"].push_back(outQualFile); outputNames.push_back(outMisMatchFile); outputTypes["mismatch"].push_back(outMisMatchFile); @@ -291,7 +318,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o for( int i=0; i files, string outputFasta, string int numMismatches = 0; string seq1 = fSeq.getAligned(); string seq2 = rSeq.getAligned(); - vector scores1 = fQual.getQualityScores(); vector scores2 = rQual.getQualityScores(); - - for (int i = 0; i < length; i++) { - if (seq1[i] == seq2[i]) { //match, add base and choose highest score + + // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; } + int overlapStart = fSeq.getStartPos(); + int seq2Start = rSeq.getStartPos(); + //bigger of the 2 starting positions is the location of the overlapping start + if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1 + overlapStart = seq2Start; + for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; } - }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2 + } + }else { //seq1 starts later so take from 0 to overlapStart from seq2 + for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; contigScores.push_back(scores2[BBaseMap[i]]); - }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1 + } + } + + int seq1End = fSeq.getEndPos(); + int seq2End = rSeq.getEndPos(); + int overlapEnd = seq1End; + if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends + + for (int i = overlapStart; i < overlapEnd; i++) { + if (seq1[i] == seq2[i]) { //match, add base and choose highest score contig += seq1[i]; contigScores.push_back(scores1[ABaseMap[i]]); + if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; } + }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base + if (scores2[BBaseMap[i]] < threshold) { } // + else { + contig += seq2[i]; + contigScores.push_back(scores2[BBaseMap[i]]); + } + }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base + if (scores1[ABaseMap[i]] < threshold) { } // + else { + contig += seq1[i]; + contigScores.push_back(scores1[ABaseMap[i]]); + } }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality char c = seq1[i]; contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; c = seq2[i]; } + if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; } contig += c; numMismatches++; }else { //should never get here @@ -412,6 +466,19 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string } } + if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2 + for (int i = overlapEnd; i < length; i++) { + contig += seq2[i]; + contigScores.push_back(scores2[BBaseMap[i]]); + } + }else { //seq2 ends before seq1 so take from overlap to length from seq1 + for (int i = overlapEnd; i < length; i++) { + contig += seq1[i]; + contigScores.push_back(scores1[ABaseMap[i]]); + } + + } + //if (num < 5) { cout << overlapStart << '\t' << overlapEnd << endl << seq1 << endl << seq2 << endl<< contig << endl; } //output outFasta << ">" << fSeq.getName() << endl << contig << endl; outQual << ">" << fSeq.getName() << endl;