vector<string> setParameters();
// Returns the name of this command, "trim.seqs".
string getCommandName() {
    return "trim.seqs";
}
// Returns the category label of this command, "Sequence Processing".
string getCommandCategory() {
    return "Sequence Processing";
}
- string getOutputFileNameTag(string, string);
+
string getHelpString();
+ string getOutputPattern(string);
// Returns the citation string for this command: the URL of its mothur wiki page.
string getCitation() {
    return "http://www.mothur.org/wiki/Trim.seqs";
}
// Returns the one-line, human-readable description of this command.
string getDescription() {
    return "provides the preprocessing features needed to screen and sort pyrosequences";
}
bool abort, createGroup;
string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
- bool flip, allFiles, qtrim, keepforward;
+ bool flip, allFiles, qtrim, keepforward, pairedOligos, reorient;
int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
int qWindowSize, qWindowStep, keepFirst, removeLast;
double qRollAverage, qThreshold, qWindowAverage, qAverage;
vector<string> revPrimer, outputNames;
set<string> filesToRemove;
+ map<int, oligosPair> pairedBarcodes;
+ map<int, oligosPair> pairedPrimers;
map<string, int> barcodes;
vector<string> groupVector;
map<string, int> primers;
vector<vector<string> > qualFileNames;
vector<vector<string> > nameFileNames;
unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
- bool flip, allFiles, qtrim, keepforward, createGroup;
+ bool flip, allFiles, qtrim, keepforward, createGroup, pairedOligos, reorient;
int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
int qWindowSize, qWindowStep, keepFirst, removeLast, count;
double qRollAverage, qThreshold, qWindowAverage, qAverage;
map<string, int> groupCounts;
map<string, string> nameMap;
map<string, string> groupMap;
+ map<int, oligosPair> pairedBarcodes;
+ map<int, oligosPair> pairedPrimers;
trimData(){}
trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
- int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
+ int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, map<int, oligosPair> pbr, map<int, oligosPair> ppr, bool po,
vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
- int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
+ int minL, int maxA, int maxH, int maxL, bool fli, bool reo, map<string, string> nm, map<string, int> ncount) {
filename = fn;
qFileName = qn;
nameFile = nf;
sdiffs = sd;
tdiffs = td;
barcodes = bar;
+ pairedPrimers = ppr;
+ pairedBarcodes = pbr;
+ pairedOligos = po;
primers = pri; numFPrimers = primers.size();
revPrimer = revP; numRPrimers = revPrimer.size();
linker = li; numLinkers = linker.size();
maxHomoP = maxH;
maxLength = maxL;
flip = fli;
+ reorient = reo;
nameMap = nm;
count = 0;
}
}
}
-
- TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
+ TrimOligos* trimOligos = NULL;
+ int numBarcodes = pDataArray->barcodes.size();
+ if (pDataArray->pairedOligos) { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->pairedPrimers, pDataArray->pairedBarcodes); numBarcodes = pDataArray->pairedBarcodes.size(); pDataArray->numFPrimers = pDataArray->pairedPrimers.size(); }
+ else { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); }
+
+ TrimOligos* rtrimOligos = NULL;
+ if (pDataArray->reorient) {
+ //create reoriented primer and barcode pairs
+ map<int, oligosPair> rpairedPrimers, rpairedBarcodes;
+ for (map<int, oligosPair>::iterator it = pDataArray->pairedPrimers.begin(); it != pDataArray->pairedPrimers.end(); it++) {
+ oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reversePrimer, rc ForwardPrimer
+ rpairedPrimers[it->first] = tempPair;
+ }
+ for (map<int, oligosPair>::iterator it = pDataArray->pairedBarcodes.begin(); it != pDataArray->pairedBarcodes.end(); it++) {
+ oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reverseBarcode, rc ForwardBarcode
+ rpairedBarcodes[it->first] = tempPair;
+ }
+ rtrimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size();
+ }
- pDataArray->count = pDataArray->lineEnd;
+ pDataArray->count = 0;
for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
- if (pDataArray->m->control_pressed) {
+ if (pDataArray->m->control_pressed) {
+ delete trimOligos; if (pDataArray->reorient) { delete rtrimOligos; }
inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
int currentSeqsDiffs = 0;
Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
+ Sequence savedSeq(currSeq.getName(), currSeq.getAligned());
- QualityScores currQual;
+ QualityScores currQual; QualityScores savedQual;
if(pDataArray->qFileName != ""){
currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
+ savedQual.setName(currQual.getName()); savedQual.setScores(currQual.getScores());
}
+
string origSeq = currSeq.getUnaligned();
if (origSeq != "") {
+ pDataArray->count++;
int barcodeIndex = 0;
int primerIndex = 0;
if(pDataArray->numLinkers != 0){
- success = trimOligos.stripLinker(currSeq, currQual);
+ success = trimOligos->stripLinker(currSeq, currQual);
if(success > pDataArray->ldiffs) { trashCode += 'k'; }
else{ currentSeqsDiffs += success; }
}
- if(pDataArray->barcodes.size() != 0){
- success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
+ if(numBarcodes != 0){
+ success = trimOligos->stripBarcode(currSeq, currQual, barcodeIndex);
if(success > pDataArray->bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
if(pDataArray->numSpacers != 0){
- success = trimOligos.stripSpacer(currSeq, currQual);
+ success = trimOligos->stripSpacer(currSeq, currQual);
if(success > pDataArray->sdiffs) { trashCode += 's'; }
else{ currentSeqsDiffs += success; }
}
if(pDataArray->numFPrimers != 0){
- success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
+ success = trimOligos->stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
if(success > pDataArray->pdiffs) { trashCode += 'f'; }
else{ currentSeqsDiffs += success; }
}
if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
if(pDataArray->numRPrimers != 0){
- success = trimOligos.stripReverse(currSeq, currQual);
+ success = trimOligos->stripReverse(currSeq, currQual);
if(!success) { trashCode += 'r'; }
}
+ if (pDataArray->reorient && (trashCode != "")) { //if you failed and want to check the reverse
+ int thisSuccess = 0;
+ string thisTrashCode = "";
+ int thisCurrentSeqsDiffs = 0;
+
+ int thisBarcodeIndex = 0;
+ int thisPrimerIndex = 0;
+
+ if(numBarcodes != 0){
+ thisSuccess = rtrimOligos->stripBarcode(savedSeq, savedQual, thisBarcodeIndex);
+ if(thisSuccess > pDataArray->bdiffs) { thisTrashCode += 'b'; }
+ else{ thisCurrentSeqsDiffs += thisSuccess; }
+ }
+
+ if(pDataArray->numFPrimers != 0){
+ thisSuccess = rtrimOligos->stripForward(savedSeq, savedQual, thisPrimerIndex, pDataArray->keepforward);
+ if(thisSuccess > pDataArray->pdiffs) { thisTrashCode += 'f'; }
+ else{ thisCurrentSeqsDiffs += thisSuccess; }
+ }
+
+ if (thisCurrentSeqsDiffs > pDataArray->tdiffs) { thisTrashCode += 't'; }
+
+ if (thisTrashCode == "") {
+ trashCode = thisTrashCode;
+ success = thisSuccess;
+ currentSeqsDiffs = thisCurrentSeqsDiffs;
+ barcodeIndex = thisBarcodeIndex;
+ primerIndex = thisPrimerIndex;
+ savedSeq.reverseComplement();
+ currSeq.setAligned(savedSeq.getAligned());
+ if(pDataArray->qFileName != ""){
+ savedQual.flipQScores();
+ currQual.setScores(savedQual.getScores());
+ }
+ }else { trashCode += "(" + thisTrashCode + ")"; }
+ }
+
+
if(pDataArray->keepFirst != 0){
//success = keepFirstTrim(currSeq, currQual);
success = 1;
}
if(trashCode.length() == 0){
- currSeq.setAligned(currSeq.getUnaligned());
- currSeq.printSequence(trimFASTAFile);
-
- if(pDataArray->qFileName != ""){
- currQual.printQScores(trimQualFile);
- }
-
- if(pDataArray->nameFile != ""){
- map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
- if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
- else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
- }
-
- int numRedundants = 0;
- if (pDataArray->countfile != "") {
- map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
- if (itCount != pDataArray->nameCount.end()) {
- trimCountFile << itCount->first << '\t' << itCount->second << endl;
- numRedundants = itCount->second-1;
- }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
- }
-
- if (pDataArray->createGroup) {
- if(pDataArray->barcodes.size() != 0){
- string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
- if (pDataArray->primers.size() != 0) {
+ string thisGroup = "";
+ if (pDataArray->createGroup) {
+ if(numBarcodes != 0){
+ thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
+ if (pDataArray->numFPrimers != 0) {
if (pDataArray->primerNameVector[primerIndex] != "") {
if(thisGroup != "") {
thisGroup += "." + pDataArray->primerNameVector[primerIndex];
}
}
}
-
- if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
- else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
-
- if (pDataArray->nameFile != "") {
- map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
- if (itName != pDataArray->nameMap.end()) {
- vector<string> thisSeqsNames;
- pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
- numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
- for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
- outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
- }
- }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
- }
-
- map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
- if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
- else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
+ }
+ }
+
+ int pos = thisGroup.find("ignore");
+ if (pos == string::npos) {
+
+ currSeq.setAligned(currSeq.getUnaligned());
+ currSeq.printSequence(trimFASTAFile);
+
+ if(pDataArray->qFileName != ""){
+ currQual.printQScores(trimQualFile);
+ }
+
+ if(pDataArray->nameFile != ""){
+ map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+ if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
+ else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+
+ int numRedundants = 0;
+ if (pDataArray->countfile != "") {
+ map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
+ if (itCount != pDataArray->nameCount.end()) {
+ trimCountFile << itCount->first << '\t' << itCount->second << endl;
+ numRedundants = itCount->second-1;
+ }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+
+ if (pDataArray->createGroup) {
+ if(numBarcodes != 0){
+
+ if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
+ else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
+
+ if (pDataArray->nameFile != "") {
+ map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+ if (itName != pDataArray->nameMap.end()) {
+ vector<string> thisSeqsNames;
+ pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
+ numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
+ for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+ outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
+ }
+ }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+
+ map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
+ if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
+ else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
+
+ }
+ }
+
+ if(pDataArray->allFiles){
+ ofstream output;
+ pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
+ currSeq.printSequence(output);
+ output.close();
- }
- }
-
- if(pDataArray->allFiles){
- ofstream output;
- pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
- currSeq.printSequence(output);
- output.close();
-
- if(pDataArray->qFileName != ""){
- pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
- currQual.printQScores(output);
- output.close();
- }
-
- if(pDataArray->nameFile != ""){
- map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
- if (itName != pDataArray->nameMap.end()) {
- pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
- output << itName->first << '\t' << itName->second << endl;
- output.close();
- }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
- }
- }
+ if(pDataArray->qFileName != ""){
+ pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
+ currQual.printQScores(output);
+ output.close();
+ }
+
+ if(pDataArray->nameFile != ""){
+ map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+ if (itName != pDataArray->nameMap.end()) {
+ pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
+ output << itName->first << '\t' << itName->second << endl;
+ output.close();
+ }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+ }
+ }
}
else{
if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
}
//report progress
- if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
+ if((pDataArray->count) % 1000 == 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
}
//report progress
if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
-
+ if (pDataArray->reorient) { delete rtrimOligos; }
+ delete trimOligos;
inFASTA.close();
trimFASTAFile.close();
scrapFASTAFile.close();