~seqPNode() {}
};
/************************************************************/
-inline bool comparePriority(seqPNode first, seqPNode second) {
+inline bool comparePriorityTopDown(seqPNode first, seqPNode second) { // Sort predicate: returns true when `first` orders before `second` -- larger numIdentical first; on equal counts the node whose seq.getName() compares greater goes first; otherwise false. Passed to sort() when the topdown flag is set.
if (first.numIdentical > second.numIdentical) { return true; }
else if (first.numIdentical == second.numIdentical) {
if (first.seq.getName() > second.seq.getName()) { return true; }
}
return false;
}
+/************************************************************/
+inline bool comparePriorityDownTop(seqPNode first, seqPNode second) { // Sort predicate: returns true when `first` orders before `second` -- smaller numIdentical first. Passed to sort() when the topdown flag is not set. NOTE(review): both nodes are taken by value, so every comparison copies them; const references would avoid that if seq.getName() is const-callable -- confirm.
+ if (first.numIdentical < second.numIdentical) { return true; } // fewer identical seqs orders earlier
+ else if (first.numIdentical == second.numIdentical) { // equal counts: fall back to the sequence name
+ if (first.seq.getName() > second.seq.getName()) { return true; } // greater name first -- same tie-break direction as comparePriorityTopDown
+ }
+ return false; // `first` does not order before `second`
+}
//************************************************************/
class PreClusterCommand : public Command {
vector<string> setParameters();
string getCommandName() { return "pre.cluster"; }
string getCommandCategory() { return "Sequence Processing"; }
- string getOutputFileNameTag(string, string);
+
string getHelpString();
- string getCitation() { return "http://www.mothur.org/wiki/Pre.cluster"; }
+ string getOutputPattern(string);
+ string getCitation() { return "Schloss PD, Gevers D, Westcott SL (2011). Reducing the effects of PCR amplification and sequencing artifacts on 16S rRNA-based studies. PLoS ONE. 6:e27310.\nhttp://www.mothur.org/wiki/Pre.cluster"; }
string getDescription() { return "implements a pseudo-single linkage algorithm with the goal of removing sequences that are likely due to pyrosequencing errors"; }
CountTable ct;
int diffs, length, processors;
- bool abort, bygroup;
+ bool abort, bygroup, topdown;
string fastafile, namefile, outputDir, groupfile, countfile;
vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
map<string, string> names; //represents the names file first column maps to second column
map<string, int>::iterator itSize;
// map<string, bool> active; //maps sequence name to whether it has already been merged or not.
vector<string> outputNames;
- map<string, vector<string> > outputTypes;
int readFASTA();
void readNameFile();
string newFName, newNName, newMName;
MothurOut* m;
int start;
- int end;
+ int end, count;
int diffs, threadID;
vector<string> groups;
vector<string> mapFileNames;
+ bool topdown;
preClusterData(){}
- preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
+ preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, bool td, int tid) { // new parameter `td` (inserted before tid) is stored in the topdown member
fastafile = f;
namefile = n;
groupfile = g;
threadID = tid;
groups = gr;
countfile = c;
+ topdown = td; // the per-group loop reads pDataArray->topdown to choose comparePriorityTopDown vs comparePriorityDownTop (pDataArray is presumably this struct -- confirm)
+ count=0; // starts at zero; the per-group loop does pDataArray->count++ once per iteration
}
};
//precluster each group
for (int k = pDataArray->start; k < pDataArray->end; k++) {
+ pDataArray->count++;
+
int start = time(NULL);
if (pDataArray->m->control_pressed) { delete parser; return 0; }
pDataArray->m->openOutputFile(pDataArray->newMName+pDataArray->groups[k]+".map", out);
pDataArray->mapFileNames.push_back(pDataArray->newMName+pDataArray->groups[k]+".map");
- //sort seqs by number of identical seqs
- sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
-
+ //sort seqs by number of identical seqs
+ if (pDataArray->topdown) { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityTopDown); }
+ else { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityDownTop); }
+
int count = 0;
//think about running through twice...