X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=6a8afd017c1e88ff1bdbde883608b5eff00b235f;hb=49d2b7459c5027557564b21e9487dadafbbbdc96;hp=b0fa7a0a2e2f1dbe733207686ec5ed8647ba888e;hpb=43ed0accfbc2852849e104ff7eccdd2c42acd4ec;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index b0fa7a0..6a8afd0 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -97,6 +97,29 @@ string TrimSeqsCommand::getHelpString(){ exit(1); } } +//********************************************************************************************************************** +string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ + try { + string outputFileName = ""; + map >::iterator it; + + //is this a type this command creates + it = outputTypes.find(type); + if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } + else { + if (type == "qfile") { outputFileName = "qual"; } + else if (type == "fasta") { outputFileName = "fasta"; } + else if (type == "group") { outputFileName = "groups"; } + else if (type == "name") { outputFileName = "names"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } + } + return outputFileName; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag"); + exit(1); + } +} //********************************************************************************************************************** @@ -329,19 +352,21 @@ int TrimSeqsCommand::execute(){ numFPrimers = 0; //this needs to be initialized numRPrimers = 0; + numSpacers = 0; + numLinkers = 0; createGroup = false; vector > fastaFileNames; vector > qualFileNames; vector > nameFileNames; - string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta"; + string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta"); outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); - string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta"; + string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta"); outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile); - string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual"; - string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual"; + string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile"); + string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile"); if (qFileName != "") { outputNames.push_back(trimQualFile); @@ -350,8 +375,8 @@ int TrimSeqsCommand::execute(){ outputTypes["qfile"].push_back(scrapQualFile); } - string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names"; - string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names"; + string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name"); + string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name"); if (nameFile != "") { m->readNames(nameFile, nameMap); @@ -367,7 +392,7 @@ int TrimSeqsCommand::execute(){ if(oligoFile != ""){ createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames); if (createGroup) { - outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; + outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group"); outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } } @@ -426,7 +451,7 @@ int TrimSeqsCommand::execute(){ m->openInputFile(it->first, in); ofstream out; - string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups"; + string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); m->openOutputFile(thisGroupName, out); @@ -435,6 +460,17 @@ int TrimSeqsCommand::execute(){ Sequence currSeq(in); m->gobble(in); out << currSeq.getName() << '\t' << it->second << endl; + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + out << thisSeqsNames[k] << '\t' << it->second << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } } in.close(); out.close(); @@ -549,7 +585,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int count = 0; bool moreSeqs = 1; - TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer); while (moreSeqs) { @@ -594,6 +630,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } + + if(rbarcodes.size() != 0){ + success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } if(numSpacers != 0){ success = trimOligos.stripSpacer(currSeq, currQual); @@ -668,6 +710,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string currQual.printQScores(trimQualFile); } + if(nameFile != ""){ map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } @@ -689,11 +732,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + int numRedundants = 0; if (nameFile != "") { map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { vector thisSeqsNames; m->splitAtChar(itName->second, thisSeqsNames, ','); + numRedundants = thisSeqsNames.size()-1; //we already include ourselves below for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; } @@ -701,8 +746,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } map::iterator it = groupCounts.find(thisGroup); - if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; } - else { groupCounts[it->first]++; } + if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; } + else { groupCounts[it->first] += (1 + numRedundants); } } } @@ -933,7 +978,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName tempPrimerQualFileNames, tempNameFileNames, lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m, - pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, + pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap); @@ -1174,12 +1219,10 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) { int startIndex = i * numSeqsPerProcessor; if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor)); - cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl; if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); } } - - if(qfilename == "") { qLines = lines; } //files with duds } + if(qfilename == "") { qLines = lines; } //files with duds return 1; #endif @@ -1243,13 +1286,36 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< primerNameVector.push_back(group); } else if(type == "REVERSE"){ - Sequence oligoRC("reverse", oligo); - oligoRC.reverseComplement(); - revPrimer.push_back(oligoRC.getUnaligned()); + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); } else if(type == "BARCODE"){ inOligos >> group; + + //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs + //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info + string temp = ""; + while (!inOligos.eof()) { + char c = inOligos.get(); + if (c == 10 || c == 13){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { temp += c; } + } + //then this is illumina data with 4 columns + if (temp != "") { + string reverseBarcode = reverseOligo(group); //reverse barcode + group = temp; + + //check for repeat barcodes + map::iterator itBar = rbarcodes.find(reverseBarcode); + if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); } + + rbarcodes[reverseBarcode]=indexBarcode; + } + //check for repeat barcodes map::iterator itBar = barcodes.find(oligo); if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } @@ -1468,6 +1534,46 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){ } } +//********************************************************************/ +string TrimSeqsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "reverseOligo"); + exit(1); + } +} //***************************************************************************************************************