X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=6f5bb979a4d45b9fc3d8201848aa9f6b2d2dddb7;hb=ee8403d4eb5760187d62b42a9cf4272de8fc0ec4;hp=9913727dcdc8257816e1f83fab2f8a4b903ff1cd;hpb=fc3b1fc4fc1c4e38fde6b0c0ee7896b5fe0b9d57;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 9913727..6f5bb97 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -329,6 +329,8 @@ int TrimSeqsCommand::execute(){ numFPrimers = 0; //this needs to be initialized numRPrimers = 0; + numSpacers = 0; + numLinkers = 0; createGroup = false; vector > fastaFileNames; vector > qualFileNames; @@ -435,6 +437,17 @@ int TrimSeqsCommand::execute(){ Sequence currSeq(in); m->gobble(in); out << currSeq.getName() << '\t' << it->second << endl; + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + out << thisSeqsNames[k] << '\t' << it->second << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } } in.close(); out.close(); @@ -549,7 +562,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int count = 0; bool moreSeqs = 1; - TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer); while (moreSeqs) { @@ -594,6 +607,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } + + if(rbarcodes.size() != 0){ + success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } if(numSpacers != 0){ success = trimOligos.stripSpacer(currSeq, currQual); @@ -668,6 +687,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string currQual.printQScores(trimQualFile); } + if(nameFile != ""){ map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } @@ -689,11 +709,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + int numRedundants = 0; if (nameFile != "") { map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { vector thisSeqsNames; m->splitAtChar(itName->second, thisSeqsNames, ','); + numRedundants = thisSeqsNames.size()-1; //we already include ourselves below for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; } @@ -701,8 +723,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } map::iterator it = groupCounts.find(thisGroup); - if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; } - else { groupCounts[it->first]++; } + if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; } + else { groupCounts[it->first] += (1 + numRedundants); } } } @@ -746,7 +768,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string count++; } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= line.end)) { break; } @@ -786,7 +808,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName int exitCommand = 1; processIDS.clear(); -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //loop through and create all the processes you want while (process != processors) { int pid = fork(); @@ -933,7 +955,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName tempPrimerQualFileNames, tempNameFileNames, lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m, - pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, + pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap); @@ -1027,7 +1049,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName } } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(createGroup){ ifstream in; string tempFile = filename + toString(processIDS[i]) + ".num.temp"; @@ -1067,7 +1089,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) { vector fastaFilePos; vector qfileFilePos; - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //set file positions for fasta file fastaFilePos = m->divideFile(filename, processors); @@ -1157,6 +1179,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) { }else{ int numFastaSeqs = 0; fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs); + if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); } if (qfilename != "") { int numQualSeqs = 0; @@ -1173,12 +1196,10 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) { int startIndex = i * numSeqsPerProcessor; if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor)); - cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl; if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); } } - - if(qfilename == "") { qLines = lines; } //files with duds } + if(qfilename == "") { qLines = lines; } //files with duds return 1; #endif @@ -1242,13 +1263,36 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< primerNameVector.push_back(group); } else if(type == "REVERSE"){ - Sequence oligoRC("reverse", oligo); - oligoRC.reverseComplement(); - revPrimer.push_back(oligoRC.getUnaligned()); + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); } else if(type == "BARCODE"){ inOligos >> group; + + //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs + //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info + string temp = ""; + while (!inOligos.eof()) { + char c = inOligos.get(); + if (c == 10 || c == 13){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { temp += c; } + } + //then this is illumina data with 4 columns + if (temp != "") { + string reverseBarcode = reverseOligo(group); //reverse barcode + group = temp; + + //check for repeat barcodes + map::iterator itBar = rbarcodes.find(reverseBarcode); + if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); } + + rbarcodes[reverseBarcode]=indexBarcode; + } + //check for repeat barcodes map::iterator itBar = barcodes.find(oligo); if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } @@ -1467,6 +1511,46 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){ } } +//********************************************************************/ +string TrimSeqsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "reverseOligo"); + exit(1); + } +} //***************************************************************************************************************