#include "command.hpp"
#include "sequence.hpp"
#include "sequenceparser.h"
+#include "sequencecountparser.h"
/************************************************************/
struct seqPNode {
~seqPNode() {}
};
/************************************************************/
-inline bool comparePriority(seqPNode first, seqPNode second) { return (first.numIdentical > second.numIdentical); }
+inline bool comparePriority(seqPNode first, seqPNode second) {
+ if (first.numIdentical > second.numIdentical) { return true; }
+ else if (first.numIdentical == second.numIdentical) {
+ if (first.seq.getName() > second.seq.getName()) { return true; }
+ }
+ return false;
+}
//************************************************************/
class PreClusterCommand : public Command {
vector<string> setParameters();
string getCommandName() { return "pre.cluster"; }
string getCommandCategory() { return "Sequence Processing"; }
+
string getHelpString();
- string getCitation() { return "http://www.mothur.org/wiki/Pre.cluster"; }
+ string getOutputPattern(string);
+ string getCitation() { return "Schloss PD, Gevers D, Westcott SL (2011). Reducing the effects of PCR amplification and sequencing artifacts on 16S rRNA-based studies. PLoS ONE. 6:e27310.\nhttp://www.mothur.org/wiki/Pre.cluster"; }
string getDescription() { return "implements a pseudo-single linkage algorithm with the goal of removing sequences that are likely due to pyrosequencing errors"; }
linePair(int i, int j) : start(i), end(j) {}
};
+ SequenceParser* parser;
+ SequenceCountParser* cparser;
+ CountTable ct;
+
int diffs, length, processors;
bool abort, bygroup;
- string fastafile, namefile, outputDir, groupfile;
+ string fastafile, namefile, outputDir, groupfile, countfile;
vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
map<string, string> names; //represents the names file first column maps to second column
map<string, int> sizes; //this map a seq name to the number of identical seqs in the names file
void readNameFile();
//int readNamesFASTA();
int calcMisMatches(string, string);
- void printData(string, string); //fasta filename, names file name
+ void printData(string, string, string); //fasta filename, names file name
int process(string);
- int loadSeqs(map<string, string>&, vector<Sequence>&);
- int driverGroups(SequenceParser*, string, string, string, int, int, vector<string> groups);
- int createProcessesGroups(SequenceParser*, string, string, string, vector<string>);
+ int loadSeqs(map<string, string>&, vector<Sequence>&, string);
+ int driverGroups(string, string, string, int, int, vector<string> groups);
+ int createProcessesGroups(string, string, string, vector<string>);
+ int mergeGroupCounts(string, string, string);
};
/**************************************************************************************************/
struct preClusterData {
string fastafile;
string namefile;
- string groupfile;
+ string groupfile, countfile;
string newFName, newNName, newMName;
MothurOut* m;
int start;
vector<string> mapFileNames;
preClusterData(){}
- preClusterData(string f, string n, string g, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
+ preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
fastafile = f;
namefile = n;
groupfile = g;
diffs = d;
threadID = tid;
groups = gr;
+ countfile = c;
}
};
//parse fasta and name file by group
SequenceParser* parser;
- if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
- else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); }
-
- int numSeqs = 0;
+ SequenceCountParser* cparser;
+ if (pDataArray->countfile != "") {
+ cparser = new SequenceCountParser(pDataArray->countfile, pDataArray->fastafile);
+ }else {
+ if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
+ else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); }
+ }
+
+ int numSeqs = 0;
vector<seqPNode> alignSeqs;
//clear out old files
ofstream outF; pDataArray->m->openOutputFile(pDataArray->newFName, outF); outF.close();
pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
map<string, string> thisNameMap;
- if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); }
- vector<Sequence> thisSeqs = parser->getSeqs(pDataArray->groups[k]);
+ vector<Sequence> thisSeqs;
+ if (pDataArray->groupfile != "") {
+ thisSeqs = parser->getSeqs(pDataArray->groups[k]);
+ }else if (pDataArray->countfile != "") {
+ thisSeqs = cparser->getSeqs(pDataArray->groups[k]);
+ }
+ if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); }
//fill alignSeqs with this groups info.
////////////////////////////////////////////////////
alignSeqs.clear();
map<string, string>::iterator it;
bool error = false;
+ map<string, int> thisCount;
+ if (pDataArray->countfile != "") { thisCount = cparser->getCountTable(pDataArray->groups[k]); }
+
for (int i = 0; i < thisSeqs.size(); i++) {
if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); }
}
}else { //no names file, you are identical to yourself
- seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
- alignSeqs.push_back(tempNode);
+ int numRep = 1;
+ if (pDataArray->countfile != "") {
+ map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
+
+ //should never be true since parser checks for this
+ if (it2 == thisCount.end()) { pDataArray->m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); error = true; }
+ else { numRep = it2->second; }
+ }
+ seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
+ alignSeqs.push_back(tempNode);
if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); }
}
}
for (int i = 0; i < alignSeqs.size(); i++) {
if (alignSeqs[i].numIdentical != 0) {
alignSeqs[i].seq.printSequence(outFasta);
- outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
+ if (pDataArray->countfile != "") { outNames << pDataArray->groups[k] << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
+ }else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
+
}
}