CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend);
+ CommandParameter pthreshold("threshold", "Number", "", "40", "", "", "",false,false); parameters.push_back(pthreshold);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n";
helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n";
+ helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=40.\n";
helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
helpString += "The make.contigs command should be in the following format: \n";
helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n";
exit(1);
}
}
-
+//**********************************************************************************************************************
+string MakeContigsCommand::getOutputFileNameTag(string type, string inputName=""){
+ try {
+ string outputFileName = "";
+ map<string, vector<string> >::iterator it;
+
+ //is this a type this command creates
+ it = outputTypes.find(type);
+ if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
+ else {
+ if (type == "fasta") { outputFileName = "contigs.fasta"; }
+ else if (type == "qfile") { outputFileName = "contigs.qual"; }
+ else if (type == "mismatch") { outputFileName = "contigs.mismatch"; }
+ else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
+ }
+ return outputFileName;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "MakeContigsCommand", "getOutputFileNameTag");
+ exit(1);
+ }
+}
//**********************************************************************************************************************
MakeContigsCommand::MakeContigsCommand(){
try {
m->mothurConvert(temp, gapExtend);
if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
+ temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "40"; }
+ m->mothurConvert(temp, threshold);
+ if ((threshold < 0) || (threshold > 40)) { m->mothurOut("[ERROR]: threshold must be between 0 and 40.\n"); abort=true; }
+
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
m->mothurConvert(temp, processors);
if (m->control_pressed) { return 0; }
- string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.fasta";
- string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.qual";
- string outMisMatchFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "contigs.mismatches";
+ string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("fasta");
+ string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("qfile");
+ string outMisMatchFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("mismatches");
outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
outputNames.push_back(outQualFile); outputTypes["qfile"].push_back(outQualFile);
outputNames.push_back(outMisMatchFile); outputTypes["mismatch"].push_back(outMisMatchFile);
for( int i=0; i<processors-1; i++ ){
string extension = toString(i) + ".temp";
- contigsData* tempcontig = new contigsData(files[i], (outputFasta + extension), (outputQual + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, i);
+ contigsData* tempcontig = new contigsData(files[i], (outputFasta + extension), (outputQual + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, threshold, i);
pDataArray.push_back(tempcontig);
processIDS.push_back(i);
int numMismatches = 0;
string seq1 = fSeq.getAligned();
string seq2 = rSeq.getAligned();
-
vector<int> scores1 = fQual.getQualityScores();
vector<int> scores2 = rQual.getQualityScores();
-
- for (int i = 0; i < length; i++) {
- if (seq1[i] == seq2[i]) { //match, add base and choose highest score
+
+ // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; }
+ int overlapStart = fSeq.getStartPos();
+ int seq2Start = rSeq.getStartPos();
+ //bigger of the 2 starting positions is the location of the overlapping start
+ if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
+ overlapStart = seq2Start;
+ for (int i = 0; i < overlapStart; i++) {
contig += seq1[i];
contigScores.push_back(scores1[ABaseMap[i]]);
- if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; }
- }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2
+ }
+ }else { //seq1 starts later so take from 0 to overlapStart from seq2
+ for (int i = 0; i < overlapStart; i++) {
contig += seq2[i];
contigScores.push_back(scores2[BBaseMap[i]]);
- }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1
+ }
+ }
+
+ int seq1End = fSeq.getEndPos();
+ int seq2End = rSeq.getEndPos();
+ int overlapEnd = seq1End;
+ if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends
+
+ for (int i = overlapStart; i < overlapEnd; i++) {
+ if (seq1[i] == seq2[i]) { //match, add base and choose highest score
contig += seq1[i];
contigScores.push_back(scores1[ABaseMap[i]]);
+ if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
+ }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
+ if (scores2[BBaseMap[i]] < threshold) { } //
+ else {
+ contig += seq2[i];
+ contigScores.push_back(scores2[BBaseMap[i]]);
+ }
+ }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
+ if (scores1[ABaseMap[i]] < threshold) { } //
+ else {
+ contig += seq1[i];
+ contigScores.push_back(scores1[ABaseMap[i]]);
+ }
}else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
char c = seq1[i];
contigScores.push_back(scores1[ABaseMap[i]]);
- if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; c = seq2[i]; }
+ if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
contig += c;
numMismatches++;
}else { //should never get here
}
}
+ if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
+ for (int i = overlapEnd; i < length; i++) {
+ contig += seq2[i];
+ contigScores.push_back(scores2[BBaseMap[i]]);
+ }
+ }else { //seq2 ends before seq1 so take from overlap to length from seq1
+ for (int i = overlapEnd; i < length; i++) {
+ contig += seq1[i];
+ contigScores.push_back(scores1[ABaseMap[i]]);
+ }
+
+ }
+ //if (num < 5) { cout << overlapStart << '\t' << overlapEnd << endl << seq1 << endl << seq2 << endl<< contig << endl; }
//output
outFasta << ">" << fSeq.getName() << endl << contig << endl;
outQual << ">" << fSeq.getName() << endl;