#include "command.hpp"
#include "sequence.hpp"
#include "qualityscores.h"
-#include "groupmap.h"
#include "trimoligos.h"
+#include "counttable.h"
class TrimSeqsCommand : public Command {
vector<string> setParameters();
string getCommandName() { return "trim.seqs"; }
string getCommandCategory() { return "Sequence Processing"; }
+ string getOutputFileNameTag(string, string);
string getHelpString();
string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
void help() { m->mothurOut(getHelpString()); }
private:
-
- GroupMap* groupMap;
-
struct linePair {
unsigned long long start;
unsigned long long end;
linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
linePair() {}
};
-
+
bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
bool keepFirstTrim(Sequence&, QualityScores&);
bool removeLastTrim(Sequence&, QualityScores&);
string reverseOligo(string);
bool abort, createGroup;
- string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
+ string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
bool flip, allFiles, qtrim, keepforward;
int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
vector<string> barcodeNameVector; //needed here?
map<string, int> groupCounts;
map<string, string> nameMap;
+ map<string, int> nameCount; //for countfile name -> repCount
+ map<string, string> groupMap; //for countfile name -> group
vector<int> processIDS; //processid
vector<linePair> lines;
vector<linePair> qLines;
- int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
- int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
+ int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
+ int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
int setLines(string, string);
};
struct trimData {
unsigned long long start, end;
MothurOut* m;
- string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
+ string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
vector<vector<string> > fastaFileNames;
vector<vector<string> > qualFileNames;
vector<vector<string> > nameFileNames;
vector<string> revPrimer;
map<string, int> barcodes;
map<string, int> primers;
+ map<string, int> nameCount;
vector<string> linker;
vector<string> spacer;
map<string, int> combos;
vector<string> barcodeNameVector;
map<string, int> groupCounts;
map<string, string> nameMap;
+ map<string, string> groupMap;
trimData(){}
- trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
+ trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
- int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
+ int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
filename = fn;
qFileName = qn;
nameFile = nf;
+ countfile = cf;
trimFileName = tn;
scrapFileName = sn;
trimQFileName = tqn;
scrapQFileName = sqn;
trimNFileName = tnn;
scrapNFileName = snn;
+ trimCFileName = tcn;
+ scrapCFileName = scn;
groupFileName = gn;
fastaFileNames = ffn;
qualFileNames = qfn;
qlineStart = qstart;
qlineEnd = qend;
m = mout;
+ nameCount = ncount;
pdiffs = pd;
bdiffs = bd;
ofstream outGroupsFile;
- if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
+ if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
if(pDataArray->allFiles){
for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
}
}
+ ofstream trimCountFile;
+ ofstream scrapCountFile;
+ if(pDataArray->countfile != ""){
+ pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
+ pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
+ if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
+ }
+
ifstream inFASTA;
pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
if (pDataArray->m->control_pressed) {
inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
- if (pDataArray->createGroup) { outGroupsFile.close(); }
+ if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
+ if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
+ if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
+ if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
+
if(pDataArray->qFileName != ""){ qFile.close(); }
return 0;
}
if(success > pDataArray->bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
-
+
if(pDataArray->numSpacers != 0){
success = trimOligos.stripSpacer(currSeq, currQual);
if(success > pDataArray->sdiffs) { trashCode += 's'; }
else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
}
+ int numRedundants = 0;
+ if (pDataArray->countfile != "") {
+ map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
+ if (itCount != pDataArray->nameCount.end()) {
+ trimCountFile << itCount->first << '\t' << itCount->second << endl;
+ numRedundants = itCount->second-1;
+ }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+
if (pDataArray->createGroup) {
if(pDataArray->barcodes.size() != 0){
string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
}
}
- outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
+ if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
+ else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
if (pDataArray->nameFile != "") {
map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
if (itName != pDataArray->nameMap.end()) {
vector<string> thisSeqsNames;
pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
+ numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
}
}
map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
- if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; }
- else { pDataArray->groupCounts[it->first]++; }
+ if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
+ else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
}
}
if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
}
+ if (pDataArray->countfile != "") {
+ map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
+ if (itCount != pDataArray->nameCount.end()) {
+ trimCountFile << itCount->first << '\t' << itCount->second << endl;
+ }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
currSeq.setName(currSeq.getName() + '|' + trashCode);
currSeq.setUnaligned(origSeq);
currSeq.setAligned(origSeq);