}
}
}
-
- }
+ }
}
catch(exception& e) {
m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
string baseTName = taxonomyFileName;
if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy(); }
- string RippedTaxName = m->getRootName(m->getSimpleName(baseTName));
- RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1));
- if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); }
- if (RippedTaxName != "") { RippedTaxName += "."; }
-
+ //set rippedTaxName to
+ string RippedTaxName = "";
+ bool foundDot = false;
+ for (int i = baseTName.length()-1; i >= 0; i--) {
+ cout << baseTName[i] << endl;
+ if (foundDot && (baseTName[i] != '.')) { RippedTaxName = baseTName[i] + RippedTaxName; }
+ else if (foundDot && (baseTName[i] == '.')) { break; }
+ else if (!foundDot && (baseTName[i] == '.')) { foundDot = true; }
+ }
+ if (RippedTaxName != "") { RippedTaxName += "."; }
+
if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy";
string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "flip.accnos";
vector< vector< vector<seqDist> > > calcDistsTotals; //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
vector< vector<seqDist> > calcDists; calcDists.resize(matrixCalculators.size());
- for (int thisIter = 0; thisIter < iters; thisIter++) {
+ for (int thisIter = 0; thisIter < iters+1; thisIter++) {
vector<SharedRAbundVector*> thisItersLookup = thisLookup;
- if (subsample) {
+ if (subsample && (thisIter != 0)) {
SubSample sample;
vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
#endif
}
- calcDistsTotals.push_back(calcDists);
-
- if (subsample) {
-
+ if (subsample && (thisIter != 0)) {
+ calcDistsTotals.push_back(calcDists);
//clean up memory
for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
thisItersLookup.clear();
for (int i = 0; i < calcDists.size(); i++) { calcDists[i].clear(); }
+ }else { //print results for whole dataset
+ for (int i = 0; i < calcDists.size(); i++) {
+ if (m->control_pressed) { break; }
+
+ //initialize matrix
+ vector< vector<double> > matrix; //square matrix to represent the distance
+ matrix.resize(thisLookup.size());
+ for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
+
+ for (int j = 0; j < calcDists[i].size(); j++) {
+ int row = calcDists[i][j].seq1;
+ int column = calcDists[i][j].seq2;
+ double dist = calcDists[i][j].dist;
+
+ matrix[row][column] = dist;
+ matrix[column][row] = dist;
+ }
+
+ string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist";
+ outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+ ofstream outDist;
+ m->openOutputFile(distFileName, outDist);
+ outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+
+ printSims(outDist, matrix);
+
+ outDist.close();
+ }
}
}
outSTD.close();
}
- }else {
-
- for (int i = 0; i < calcDists.size(); i++) {
- if (m->control_pressed) { break; }
-
- //initialize matrix
- vector< vector<double> > matrix; //square matrix to represent the distance
- matrix.resize(thisLookup.size());
- for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
-
- for (int j = 0; j < calcDists[i].size(); j++) {
- int row = calcDists[i][j].seq1;
- int column = calcDists[i][j].seq2;
- double dist = calcDists[i][j].dist;
-
- matrix[row][column] = dist;
- matrix[column][row] = dist;
- }
-
- string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist";
- outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
- ofstream outDist;
- m->openOutputFile(distFileName, outDist);
- outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-
- printSims(outDist, matrix);
-
- outDist.close();
- }
}
return 0;
map<string, set<int> > labelToEnds;
if ((format != "sharedfile")) { inputFileNames.push_back(inputfile); }
else { inputFileNames = parseSharedFile(sharedfile, labelToEnds); format = "rabund"; }
- for (map<string, set<int> >::iterator it = labelToEnds.begin(); it != labelToEnds.end(); it++) {
- cout << it->first << endl;
- for (set<int>::iterator its = (it->second).begin(); its != (it->second).end(); its++) {
- cout << (*its) << endl;
- }
- }
- if (m->control_pressed) { return 0; }
+
+ if (m->control_pressed) { return 0; }
map<int, string> file2Group; //index in outputNames[i] -> group
for (int p = 0; p < inputFileNames.size(); p++) {
CommandParameter pjumble("jumble", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pjumble);
CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets);
- CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+ CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pgroupmode);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
try {
string helpString = "";
ValidCalculators validCalculator;
- helpString += "The collect.shared command parameters are shared, label, freq, calc and groups. shared is required if there is no current sharedfile. \n";
- helpString += "The rarefaction.shared command parameters are shared, design, label, iters, groups, sets, jumble and calc. shared is required if there is no current sharedfile. \n";
+ helpString += "The rarefaction.shared command parameters are shared, design, label, iters, groups, sets, jumble, groupmode and calc. shared is required if there is no current sharedfile. \n";
helpString += "The design parameter allows you to assign your groups to sets. If provided mothur will run rarefaction.shared on a per set basis. \n";
helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n";
helpString += "The rarefaction command should be in the following format: \n";
if (m->isTrue(temp)) { jumble = true; }
else { jumble = false; }
m->jumble = jumble;
+
+ temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; }
+ groupMode = m->isTrue(temp);
}
for (int i = 0; i < Sets.size(); i++) {
process(designMap, Sets[i]);
}
+
+ if (groupMode) { outputNames = createGroupFile(outputNames); }
}
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
outputNames.push_back(fileNameRoot+"shared.r_nseqs"); outputTypes["sharedr_nseqs"].push_back(fileNameRoot+"shared.r_nseqs");
}
}
+ file2Group[outputNames.size()-1] = thisSet;
}
//if the users entered no valid calculators don't execute command
}
}
//**********************************************************************************************************************
+vector<string> RareFactSharedCommand::createGroupFile(vector<string>& outputNames) {
+ try {
+
+ vector<string> newFileNames;
+
+ //find different types of files
+ map<string, map<string, string> > typesFiles;
+ map<string, vector< vector<string> > > fileLabels; //combofile name to labels. each label is a vector because it may be unique lci hci.
+ vector<string> groupNames;
+ for (int i = 0; i < outputNames.size(); i++) {
+
+ string extension = m->getExtension(outputNames[i]);
+ string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+ m->mothurRemove(combineFileName); //remove old file
+
+ ifstream in;
+ m->openInputFile(outputNames[i], in);
+
+ string labels = m->getline(in);
+
+ istringstream iss (labels,istringstream::in);
+ string newLabel = ""; vector<string> theseLabels;
+ while(!iss.eof()) { iss >> newLabel; m->gobble(iss); theseLabels.push_back(newLabel); }
+ vector< vector<string> > allLabels;
+ vector<string> thisSet; thisSet.push_back(theseLabels[0]); allLabels.push_back(thisSet); thisSet.clear(); //makes "numSampled" its own grouping
+ for (int j = 1; j < theseLabels.size()-1; j++) {
+ if (theseLabels[j+1] == "lci") {
+ thisSet.push_back(theseLabels[j]);
+ thisSet.push_back(theseLabels[j+1]);
+ thisSet.push_back(theseLabels[j+2]);
+ j++; j++;
+ }else{ //no lci or hci for this calc.
+ thisSet.push_back(theseLabels[j]);
+ }
+ allLabels.push_back(thisSet);
+ thisSet.clear();
+ }
+ fileLabels[combineFileName] = allLabels;
+
+ map<string, map<string, string> >::iterator itfind = typesFiles.find(extension);
+ if (itfind != typesFiles.end()) {
+ (itfind->second)[outputNames[i]] = file2Group[i];
+ }else {
+ map<string, string> temp;
+ temp[outputNames[i]] = file2Group[i];
+ typesFiles[extension] = temp;
+ }
+ if (!(m->inUsersGroups(file2Group[i], groupNames))) { groupNames.push_back(file2Group[i]); }
+ }
+
+ //for each type create a combo file
+
+ for (map<string, map<string, string> >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) {
+
+ ofstream out;
+ string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first;
+ m->openOutputFileAppend(combineFileName, out);
+ newFileNames.push_back(combineFileName);
+ map<string, string> thisTypesFiles = it->second; //it->second maps filename to group
+ set<int> numSampledSet;
+
+ //open each type summary file
+ map<string, map<int, vector< vector<string> > > > files; //maps file name to lines in file
+ int maxLines = 0;
+ for (map<string, string>::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) {
+
+ string thisfilename = itFileNameGroup->first;
+ string group = itFileNameGroup->second;
+
+ ifstream temp;
+ m->openInputFile(thisfilename, temp);
+
+ //read through first line - labels
+ m->getline(temp); m->gobble(temp);
+
+ map<int, vector< vector<string> > > thisFilesLines;
+ while (!temp.eof()){
+ int numSampled = 0;
+ temp >> numSampled; m->gobble(temp);
+
+ vector< vector<string> > theseReads;
+ vector<string> thisSet; thisSet.push_back(toString(numSampled)); theseReads.push_back(thisSet); thisSet.clear();
+ for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //output thing like 0.03-A lci-A hci-A
+ vector<string> reads;
+ string next = "";
+ for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { //output modified labels
+ temp >> next; m->gobble(temp);
+ reads.push_back(next);
+ }
+ theseReads.push_back(reads);
+ }
+ thisFilesLines[numSampled] = theseReads;
+ m->gobble(temp);
+
+ numSampledSet.insert(numSampled);
+ }
+
+ files[group] = thisFilesLines;
+
+ //save longest file for below
+ if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); }
+
+ temp.close();
+ m->mothurRemove(thisfilename);
+ }
+
+ //output new labels line
+ out << fileLabels[combineFileName][0][0] << '\t';
+ for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //output thing like 0.03-A lci-A hci-A
+ for (int n = 0; n < groupNames.size(); n++) { // for each group
+ for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) { //output modified labels
+ out << fileLabels[combineFileName][k][l] << '-' << groupNames[n] << '\t';
+ }
+ }
+ }
+ out << endl;
+
+ //for each label
+ for (set<int>::iterator itNumSampled = numSampledSet.begin(); itNumSampled != numSampledSet.end(); itNumSampled++) {
+
+ out << (*itNumSampled) << '\t';
+
+ if (m->control_pressed) { break; }
+
+ for (int k = 1; k < fileLabels[combineFileName].size(); k++) { //each chunk
+ //grab data for each group
+ for (map<string, map<int, vector< vector<string> > > >::iterator itFileNameGroup = files.begin(); itFileNameGroup != files.end(); itFileNameGroup++) {
+
+ string group = itFileNameGroup->first;
+
+ map<int, vector< vector<string> > >::iterator itLine = files[group].find(*itNumSampled);
+ if (itLine != files[group].end()) {
+ for (int l = 0; l < (itLine->second)[k].size(); l++) {
+ out << (itLine->second)[k][l] << '\t';
+
+ }
+ }else {
+ for (int l = 0; l < fileLabels[combineFileName][k].size(); l++) {
+ out << "NA" << '\t';
+ }
+ }
+ }
+ }
+ out << endl;
+ }
+ out.close();
+ }
+
+ //return combine file name
+ return newFileNames;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RareFactSharedCommand", "createGroupFile");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
string format;
float freq;
- bool abort, allLines, jumble;
+ map<int, string> file2Group; //index in outputNames[i] -> group
+ bool abort, allLines, jumble, groupMode;
set<string> labels; //holds labels to be used
string label, calc, groups, outputDir, sharedfile, designfile;
vector<string> Estimators, Groups, outputNames, Sets;
int process(GroupMap&, string);
+ vector<string> createGroupFile(vector<string>&);
};
//divide the groups between the processors
vector<linePair> lines;
+ vector<int> numFilesToComplete;
int numFilesPerProcessor = filenames.size() / processors;
for (int i = 0; i < processors; i++) {
int startIndex = i * numFilesPerProcessor;
int endIndex = (i+1) * numFilesPerProcessor;
if(i == (processors - 1)){ endIndex = filenames.size(); }
lines.push_back(linePair(startIndex, endIndex));
+ numFilesToComplete.push_back((endIndex-startIndex));
}
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
process++;
}else if (pid == 0){
num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
+
+ //pass numSeqs to parent
+ ofstream out;
+ string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
+ m->openOutputFile(tempFile, out);
+ out << num << endl;
+ out.close();
+
exit(0);
}else {
m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
#endif
for (int i=0;i<processIDS.size();i++) {
+ ifstream in;
+ string tempFile = compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
+ m->openInputFile(tempFile, in);
+ if (!in.eof()) {
+ int tempNum = 0;
+ in >> tempNum;
+ if (tempNum != numFilesToComplete[i+1]) {
+ m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches. The flow files may be too large to process with multiple processors. \n");
+ }
+ }
+ in.close(); m->mothurRemove(tempFile);
+
if (compositeFASTAFileName != "") {
m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
try {
+ int numCompleted = 0;
+
for(int i=start;i<end;i++){
if (m->control_pressed) { break; }
}
}
+ numCompleted++;
m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
}
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
- return 0;
+ return numCompleted;
}catch(exception& e) {
m->errorOut(e, "ShhherCommand", "driver");
exit(1);
}
}
+//**********************************************************************************************************************
+int SubSample::getSample(SAbundVector*& sabund, int size) {
+ try {
+
+ OrderVector* order = new OrderVector();
+ *order = sabund->getOrderVector(NULL);
+
+ int numBins = order->getNumBins();
+ int thisSize = order->getNumSeqs();
+
+ if (thisSize > size) {
+ random_shuffle(order->begin(), order->end());
+
+ RAbundVector* rabund = new RAbundVector(numBins);
+ rabund->setLabel(order->getLabel());
-
+ for (int j = 0; j < size; j++) {
+
+ if (m->control_pressed) { delete order; delete rabund; return 0; }
+
+ int bin = order->get(j);
+
+ int abund = rabund->get(bin);
+ rabund->set(bin, (abund+1));
+ }
+
+ delete sabund;
+ sabund = new SAbundVector();
+ *sabund = rabund->getSAbundVector();
+ delete rabund;
+
+ }else if (thisSize < size) { m->mothurOut("[ERROR]: The size you requested is larger than the number of sequences in the sabund vector. You requested " + toString(size) + " and you only have " + toString(thisSize) + " seqs in your sabund vector.\n"); m->control_pressed = true; }
+
+ if (m->control_pressed) { return 0; }
+
+ delete order;
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SubSampleCommand", "getSample");
+ exit(1);
+ }
+}
//**********************************************************************************************************************
//Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map<string, string>); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe".
+ int getSample(SAbundVector*&, int); //destroys sabundvector passed in, so copy it if you need it
private:
#include "boneh.h"
#include "solow.h"
#include "shen.h"
+#include "subsample.h"
//**********************************************************************************************************************
vector<string> SummaryCommand::setParameters(){
CommandParameter prabund("rabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(prabund);
CommandParameter psabund("sabund", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(psabund);
CommandParameter pshared("shared", "InputTypes", "", "", "LRSS", "LRSS", "none",false,false); parameters.push_back(pshared);
+ CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
+ CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap-geometric-qstat-logseries-bergerparker-bstick-goodscoverage-efron-boneh-solow-shen", "sobs-chao-ace-jack-shannon-npshannon-simpson", "", "", "",true,false); parameters.push_back(pcalc);
CommandParameter pabund("abund", "Number", "", "10", "", "", "",false,false); parameters.push_back(pabund);
try {
string helpString = "";
ValidCalculators validCalculator;
- helpString += "The summary.single command parameters are list, sabund, rabund, shared, label, calc, abund and groupmode. list, sabund, rabund or shared is required unless you have a valid current file.\n";
+ helpString += "The summary.single command parameters are list, sabund, rabund, shared, subsample, iters, label, calc, abund and groupmode. list, sabund, rabund or shared is required unless you have a valid current file.\n";
helpString += "The summary.single command should be in the following format: \n";
helpString += "summary.single(label=yourLabel, calc=yourEstimators).\n";
helpString += "Example summary.single(label=unique-.01-.03, calc=sobs-chao-ace-jack-bootstrap-shannon-npshannon-simpson).\n";
helpString += validCalculator.printCalc("summary");
+ helpString += "The subsample parameter allows you to enter the size of the sample or you can set subsample=T and mothur will use the size of your smallest group in the case of a shared file. With a list, sabund or rabund file you must provide a subsample size.\n";
+ helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n";
helpString += "The default value calc is sobs-chao-ace-jack-shannon-npshannon-simpson\n";
helpString += "If you are running summary.single with a shared file and would like your summary results collated in one file, set groupmode=t. (Default=true).\n";
helpString += "The label parameter is used to analyze specific labels in your input.\n";
temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; }
groupMode = m->isTrue(temp);
-
+ temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
+ m->mothurConvert(temp, iters);
+
+ temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; }
+ if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+ else {
+ if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later
+ else { subsample = false; subsampleSize = -1; }
+ }
+
+ if (subsample == false) { iters = 1; }
+ else {
+ //if you did not set a samplesize and are not using a sharedfile
+ if ((subsampleSize == -1) && (format != "sharedfile")) { m->mothurOut("[ERROR]: If you want to subsample with a list, rabund or sabund file, you must provide the sample size. You can do this by setting subsample=yourSampleSize.\n"); abort=true; }
+ }
+
}
}
catch(exception& e) {
int numLines = 0;
int numCols = 0;
-
+ map<string, string> groupIndex;
+
for (int p = 0; p < inputFileNames.size(); p++) {
numLines = 0;
numCols = 0;
string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "summary";
+ string fileNameAve = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "ave";
+ string fileNameSTD = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p])) + "std";
outputNames.push_back(fileNameRoot); outputTypes["summary"].push_back(fileNameRoot);
+
+
if (inputFileNames.size() > 1) {
m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[p]); m->mothurOutEndLine(); m->mothurOutEndLine();
+ groupIndex[fileNameRoot] = groups[p];
}
sumCalculators.clear();
ofstream outputFileHandle;
m->openOutputFile(fileNameRoot, outputFileHandle);
outputFileHandle << "label";
+
+ ofstream outAve, outSTD;
+ if (subsample) {
+ m->openOutputFile(fileNameAve, outAve);
+ m->openOutputFile(fileNameSTD, outSTD);
+ outputNames.push_back(fileNameAve); outputTypes["ave"].push_back(fileNameAve);
+ outputNames.push_back(fileNameSTD); outputTypes["std"].push_back(fileNameSTD);
+ outAve << "label"; outSTD << "label";
+ outAve.setf(ios::fixed, ios::floatfield); outAve.setf(ios::showpoint);
+ outSTD.setf(ios::fixed, ios::floatfield); outSTD.setf(ios::showpoint);
+ if (inputFileNames.size() > 1) {
+ groupIndex[fileNameAve] = groups[p];
+ groupIndex[fileNameSTD] = groups[p];
+ }
+ }
input = new InputData(inputFileNames[p], format);
sabund = input->getSAbundVector();
for(int i=0;i<sumCalculators.size();i++){
if(sumCalculators[i]->getCols() == 1){
outputFileHandle << '\t' << sumCalculators[i]->getName();
+ if (subsample) { outAve << '\t' << sumCalculators[i]->getName(); outSTD << '\t' << sumCalculators[i]->getName(); }
numCols++;
}
else{
outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci";
+ if (subsample) { outAve << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; outSTD << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; }
numCols += 3;
}
}
outputFileHandle << endl;
+ if (subsample) { outSTD << endl; outAve << endl; }
//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
set<string> processedLabels;
set<string> userLabels = labels;
- if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
+
+
+ if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
- if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
+ if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
if(allLines == 1 || labels.count(sabund->getLabel()) == 1){
processedLabels.insert(sabund->getLabel());
userLabels.erase(sabund->getLabel());
- outputFileHandle << sabund->getLabel();
- for(int i=0;i<sumCalculators.size();i++){
- vector<double> data = sumCalculators[i]->getValues(sabund);
-
- if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
-
- outputFileHandle << '\t';
- sumCalculators[i]->print(outputFileHandle);
- }
- outputFileHandle << endl;
+ process(sabund, outputFileHandle, outAve, outSTD);
+
+ if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
numLines++;
}
processedLabels.insert(sabund->getLabel());
userLabels.erase(sabund->getLabel());
- outputFileHandle << sabund->getLabel();
- for(int i=0;i<sumCalculators.size();i++){
- vector<double> data = sumCalculators[i]->getValues(sabund);
-
- if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
-
- outputFileHandle << '\t';
- sumCalculators[i]->print(outputFileHandle);
- }
- outputFileHandle << endl;
+ process(sabund, outputFileHandle, outAve, outSTD);
+
+ if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
numLines++;
//restore real lastlabel to save below
sabund = input->getSAbundVector();
}
- if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete input; return 0; }
+ if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete input; return 0; }
//output error messages about any remaining user labels
set<string>::iterator it;
sabund = input->getSAbundVector(lastLabel);
m->mothurOut(sabund->getLabel()); m->mothurOutEndLine();
- outputFileHandle << sabund->getLabel();
- for(int i=0;i<sumCalculators.size();i++){
- vector<double> data = sumCalculators[i]->getValues(sabund);
-
- if (m->control_pressed) { outputFileHandle.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
-
- outputFileHandle << '\t';
- sumCalculators[i]->print(outputFileHandle);
- }
- outputFileHandle << endl;
+ process(sabund, outputFileHandle, outAve, outSTD);
+
+ if (m->control_pressed) { outputFileHandle.close(); outAve.close(); outSTD.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete sabund; delete input; return 0; }
numLines++;
delete sabund;
}
outputFileHandle.close();
+ if (subsample) { outAve.close(); outSTD.close(); }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } delete input; return 0; }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
//create summary file containing all the groups data for each label - this function just combines the info from the files already created.
- if ((sharedfile != "") && (groupMode)) { outputNames.push_back(createGroupSummaryFile(numLines, numCols, outputNames)); }
+ if ((sharedfile != "") && (groupMode)) { vector<string> comboNames = createGroupSummaryFile(numLines, numCols, outputNames, groupIndex); for (int i = 0; i < comboNames.size(); i++) { outputNames.push_back(comboNames[i]); } }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
}
}
//**********************************************************************************************************************
+int SummaryCommand::process(SAbundVector*& sabund, ofstream& outputFileHandle, ofstream& outAve, ofstream& outStd) {
+ try {
+
+ //calculator -> data -> values
+ vector< vector< vector<double> > > results; results.resize(sumCalculators.size());
+
+ outputFileHandle << sabund->getLabel();
+
+ SubSample sample;
+ for (int thisIter = 0; thisIter < iters+1; thisIter++) {
+
+ SAbundVector* thisIterSabund = sabund;
+
+ //we want the summary results for the whole dataset, then the subsampling
+ if ((thisIter > 0) && subsample) { //subsample sabund and run it
+ //copy sabund since getSample destroys it
+ RAbundVector rabund = sabund->getRAbundVector();
+ SAbundVector* newSabund = new SAbundVector();
+ *newSabund = rabund.getSAbundVector();
+
+ sample.getSample(newSabund, subsampleSize);
+ thisIterSabund = newSabund;
+ }
+
+ for(int i=0;i<sumCalculators.size();i++){
+ vector<double> data = sumCalculators[i]->getValues(thisIterSabund);
+
+ if (m->control_pressed) { return 0; }
+
+ if (thisIter == 0) {
+ outputFileHandle << '\t';
+ sumCalculators[i]->print(outputFileHandle);
+ }else {
+ //some of the calc have hci and lci need to make room for that
+ if (results[i].size() == 0) { results[i].resize(data.size()); }
+ //save results for ave and std.
+ for (int j = 0; j < data.size(); j++) {
+ if (m->control_pressed) { return 0; }
+ results[i][j].push_back(data[j]);
+ }
+ }
+ }
+
+ //cleanup memory
+ if ((thisIter > 0) && subsample) { delete thisIterSabund; }
+ }
+ outputFileHandle << endl;
+
+ if (subsample) {
+ outAve << sabund->getLabel() << '\t'; outStd << sabund->getLabel() << '\t';
+ //find ave and std for this label and output
+ //will need to modify the createGroupSummary to combine results and not mess with the .summary file.
+
+ //calcs -> values
+ vector< vector<double> > calcAverages; calcAverages.resize(sumCalculators.size());
+ for (int i = 0; i < calcAverages.size(); i++) { calcAverages[i].resize(results[i].size(), 0); }
+
+ for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
+ for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
+ for (int j = 0; j < calcAverages[i].size(); j++) {
+ calcAverages[i][j] += results[i][j][thisIter];
+ }
+ }
+ }
+
+ for (int i = 0; i < calcAverages.size(); i++) { //finds average.
+ for (int j = 0; j < calcAverages[i].size(); j++) {
+ calcAverages[i][j] /= (float) iters;
+ outAve << calcAverages[i][j] << '\t';
+ }
+ }
+
+ //find standard deviation
+ vector< vector<double> > stdDev; stdDev.resize(sumCalculators.size());
+ for (int i = 0; i < stdDev.size(); i++) { stdDev[i].resize(results[i].size(), 0); }
+
+ for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+ for (int i = 0; i < stdDev.size(); i++) {
+ for (int j = 0; j < stdDev[i].size(); j++) {
+ stdDev[i][j] += ((results[i][j][thisIter] - calcAverages[i][j]) * (results[i][j][thisIter] - calcAverages[i][j]));
+ }
+ }
+ }
+
+ for (int i = 0; i < stdDev.size(); i++) { //finds average.
+ for (int j = 0; j < stdDev[i].size(); j++) {
+ stdDev[i][j] /= (float) iters;
+ stdDev[i][j] = sqrt(stdDev[i][j]);
+ outStd << stdDev[i][j] << '\t';
+ }
+ }
+ outAve << endl; outStd << endl;
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SummaryCommand", "process");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
vector<string> SummaryCommand::parseSharedFile(string filename) {
try {
vector<string> filenames;
string sharedFileRoot = m->getRootName(filename);
- //clears file before we start to write to it below
+ /******************************************************/
+ if (subsample) {
+ if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+ subsampleSize = lookup[0]->getNumSeqs();
+ for (int i = 1; i < lookup.size(); i++) {
+ int thisSize = lookup[i]->getNumSeqs();
+
+ if (thisSize < subsampleSize) { subsampleSize = thisSize; }
+ }
+ }else {
+ m->clearGroups();
+ vector<string> Groups;
+ vector<SharedRAbundVector*> temp;
+ for (int i = 0; i < lookup.size(); i++) {
+ if (lookup[i]->getNumSeqs() < subsampleSize) {
+ m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+ delete lookup[i];
+ }else {
+ Groups.push_back(lookup[i]->getGroup());
+ temp.push_back(lookup[i]);
+ }
+ }
+ lookup = temp;
+ m->setGroups(Groups);
+ }
+
+ if (lookup.size() < 1) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return filenames; }
+ }
+
+
+ /******************************************************/
+
+ //clears file before we start to write to it below
for (int i=0; i<lookup.size(); i++) {
m->mothurRemove((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
}
-
+
ofstream* temp;
for (int i=0; i<lookup.size(); i++) {
temp = new ofstream;
}
}
//**********************************************************************************************************************
-string SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<string>& outputNames) {
+vector<string> SummaryCommand::createGroupSummaryFile(int numLines, int numCols, vector<string>& outputNames, map<string, string> groupIndex) {
try {
-
- ofstream out;
- string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups.summary";
-
- //open combined file
- m->openOutputFile(combineFileName, out);
-
+
//open each groups summary file
+ vector<string> newComboNames;
string newLabel = "";
- map<string, vector<string> > files;
+ map<string, map<string, vector<string> > > files;
for (int i=0; i<outputNames.size(); i++) {
+ string extension = m->getExtension(outputNames[i]);
+ string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+ m->mothurRemove(combineFileName); //remove old file
+
vector<string> thisFilesLines;
-
+
ifstream temp;
m->openInputFile(outputNames[i], temp);
temp >> tempLabel;
//save for later
- if (j == 1) { thisLine += groups[i] + "\t" + tempLabel + "\t"; }
+ if (j == 1) { thisLine += groupIndex[outputNames[i]] + "\t" + tempLabel + "\t"; }
else{ thisLine += tempLabel + "\t"; }
}
m->gobble(temp);
}
-
- files[outputNames[i]] = thisFilesLines;
+
+ map<string, map<string, vector<string> > >::iterator itFiles = files.find(extension);
+ if (itFiles != files.end()) { //add new files info to existing type
+ files[extension][outputNames[i]] = thisFilesLines;
+ }else {
+ map<string, vector<string> > thisFile;
+ thisFile[outputNames[i]] = thisFilesLines;
+ files[extension] = thisFile;
+ }
temp.close();
m->mothurRemove(outputNames[i]);
}
- //output label line to new file
- out << newLabel << endl;
-
- //for each label
- for (int k = 0; k < numLines; k++) {
-
- //grab summary data for each group
- for (int i=0; i<outputNames.size(); i++) {
- out << files[outputNames[i]][k];
- }
- }
-
- outputNames.clear();
-
- out.close();
+
+ for (map<string, map<string, vector<string> > >::iterator itFiles = files.begin(); itFiles != files.end(); itFiles++) {
+
+ if (m->control_pressed) { break; }
+
+ string extension = itFiles->first;
+ map<string, vector<string> > thisType = itFiles->second;
+ string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+ newComboNames.push_back(combineFileName);
+ //open combined file
+ ofstream out;
+ m->openOutputFile(combineFileName, out);
+
+ //output label line to new file
+ out << newLabel << endl;
+
+ //for each label
+ for (int k = 0; k < numLines; k++) {
+
+ //grab summary data for each group
+ for (map<string, vector<string> >::iterator itType = thisType.begin(); itType != thisType.end(); itType++) {
+ out << (itType->second)[k];
+ }
+ }
+
+ outputNames.clear();
+
+ out.close();
+ }
//return combine file name
- return combineFileName;
+ return newComboNames;
}
catch(exception& e) {
vector<Calculator*> sumCalculators;
InputData* input;
SAbundVector* sabund;
- int abund, size;
+ int abund, size, iters, subsampleSize;
- bool abort, allLines, groupMode;
+ bool abort, allLines, groupMode, subsample;
set<string> labels; //holds labels to be used
string label, calc, outputDir, sharedfile, listfile, rabundfile, sabundfile, format, inputfile;
vector<string> Estimators;
vector<string> groups;
vector<string> parseSharedFile(string);
- string createGroupSummaryFile(int, int, vector<string>&);
+ vector<string> createGroupSummaryFile(int, int, vector<string>&, map<string, string>);
+ int process(SAbundVector*&, ofstream&, ofstream&, ofstream&);
};
return 0;
}
/******************************************************/
-
+ if (subsample) {
+ if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+ subsampleSize = lookup[0]->getNumSeqs();
+ for (int i = 1; i < lookup.size(); i++) {
+ int thisSize = lookup[i]->getNumSeqs();
+
+ if (thisSize < subsampleSize) { subsampleSize = thisSize; }
+ }
+ }else {
+ m->clearGroups();
+ Groups.clear();
+ vector<SharedRAbundVector*> temp;
+ for (int i = 0; i < lookup.size(); i++) {
+ if (lookup[i]->getNumSeqs() < subsampleSize) {
+ m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+ delete lookup[i];
+ }else {
+ Groups.push_back(lookup[i]->getGroup());
+ temp.push_back(lookup[i]);
+ }
+ }
+ lookup = temp;
+ m->setGroups(Groups);
+ }
+
+ if (lookup.size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; delete input; return 0; }
+ }
+
/******************************************************/
//comparison breakup to be used by different processes later
- numGroups = m->getNumGroups();
+ numGroups = lookup.size();
lines.resize(processors);
for (int i = 0; i < processors; i++) {
lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
vector< vector< vector<seqDist> > > calcDistsTotals; //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
vector< vector<seqDist> > calcDists; calcDists.resize(sumCalculators.size());
- for (int thisIter = 0; thisIter < iters; thisIter++) {
+ for (int thisIter = 0; thisIter < iters+1; thisIter++) {
vector<SharedRAbundVector*> thisItersLookup = thisLookup;
- if (subsample) {
+ if (subsample && (thisIter != 0)) { //we want the summary results for the whole dataset, then the subsampling
SubSample sample;
vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
#endif
}
- calcDistsTotals.push_back(calcDists);
- if (subsample) {
+ if (subsample && (thisIter != 0)) { //we want the summary results for the whole dataset, then the subsampling
+ calcDistsTotals.push_back(calcDists);
//clean up memory
for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
thisItersLookup.clear();
for (int i = 0; i < calcDists.size(); i++) { calcDists[i].clear(); }
+ }else {
+ if (createPhylip) {
+ for (int i = 0; i < calcDists.size(); i++) {
+ if (m->control_pressed) { break; }
+
+ //initialize matrix
+ vector< vector<double> > matrix; //square matrix to represent the distance
+ matrix.resize(thisLookup.size());
+ for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
+
+ for (int j = 0; j < calcDists[i].size(); j++) {
+ int row = calcDists[i][j].seq1;
+ int column = calcDists[i][j].seq2;
+ double dist = calcDists[i][j].dist;
+
+ matrix[row][column] = dist;
+ matrix[column][row] = dist;
+ }
+
+ string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist";
+ outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+ ofstream outDist;
+ m->openOutputFile(distFileName, outDist);
+ outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+
+ printSims(outDist, matrix);
+
+ outDist.close();
+ }
+ }
}
}
outSTD.close();
}
- }else {
- if (createPhylip) {
- for (int i = 0; i < calcDists.size(); i++) {
- if (m->control_pressed) { break; }
-
- //initialize matrix
- vector< vector<double> > matrix; //square matrix to represent the distance
- matrix.resize(thisLookup.size());
- for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
-
- for (int j = 0; j < calcDists[i].size(); j++) {
- int row = calcDists[i][j].seq1;
- int column = calcDists[i][j].seq2;
- double dist = calcDists[i][j].dist;
-
- matrix[row][column] = dist;
- matrix[column][row] = dist;
- }
-
- string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist";
- outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
- ofstream outDist;
- m->openOutputFile(distFileName, outDist);
- outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-
- printSims(outDist, matrix);
-
- outDist.close();
- }
- }
}
-
+
return 0;
}
catch(exception& e) {
#include "needlemanoverlap.hpp"
+/********************************************************************/
+//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, map<string, int> rbr, vector<string> r, vector<string> lk, vector<string> sp){
+ try {
+ m = MothurOut::getInstance();
+
+ pdiffs = p;
+ bdiffs = b;
+ ldiffs = l;
+ sdiffs = s;
+
+ barcodes = br;
+ rbarcodes = rbr;
+ primers = pr;
+ revPrimer = r;
+ linker = lk;
+ spacer = sp;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "TrimOligos");
+ exit(1);
+ }
+}
/********************************************************************/
//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
}
oligo = oligo.substr(0,alnLength);
temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
+ int numDiff = countDiffs(oligo, temp);
if(numDiff < minDiff){
minDiff = numDiff;
alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
oligo = alignment->getSeqAAln();
string temp = alignment->getSeqBAln();
-
+
int alnLength = oligo.length();
for(int i=oligo.length()-1;i>=0;i--){
}
oligo = oligo.substr(0,alnLength);
temp = temp.substr(0,alnLength);
-
+
int numDiff = countDiffs(oligo, temp);
if(numDiff < minDiff){
exit(1);
}
+}
+//*******************************************************************/
+int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ int success = bdiffs + 1; //guilty until proven innocent
+
+ //can you find the barcode
+ for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ string oligo = it->first;
+ if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+ group = it->second;
+ seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+
+ if(qual.getName() != ""){
+ qual.trimQScores(-1, rawSequence.length()-oligo.length());
+ }
+
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((bdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (rbarcodes.size() > 0) {
+ map<string,int>::iterator it;
+
+ for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ if(it->first.length() > maxLength){
+ maxLength = it->first.length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minGroup = -1;
+ int minPos = 0;
+
+ for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ string oligo = it->first;
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=0;i<alnLength;i++){
+ if(oligo[i] != '-'){ alnLength = i; break; }
+ }
+ oligo = oligo.substr(alnLength);
+ temp = temp.substr(alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minGroup = it->second;
+ minPos = 0;
+ for(int i=alnLength-1;i>=0;i--){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > bdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
+ else{ //use the best match
+ group = minGroup;
+ seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+
+ if(qual.getName() != ""){
+ qual.trimQScores(-1, (rawSequence.length()-minPos));
+ }
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripRBarcode");
+ exit(1);
+ }
+
+}
+//*******************************************************************/
+int TrimOligos::stripRBarcode(Sequence& seq, int& group){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ int success = bdiffs + 1; //guilty until proven innocent
+
+ //can you find the barcode
+ for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ string oligo = it->first;
+ if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+ group = it->second;
+ seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the barcode or if you don't want to allow for diffs
+ if ((bdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ Alignment* alignment;
+ if (rbarcodes.size() > 0) {
+ map<string,int>::iterator it;
+
+ for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ if(it->first.length() > maxLength){
+ maxLength = it->first.length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the barcode
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minGroup = -1;
+ int minPos = 0;
+
+ for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ string oligo = it->first;
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ success = bdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ for(int i=0;i<alnLength;i++){
+ if(oligo[i] != '-'){ alnLength = i; break; }
+ }
+ oligo = oligo.substr(alnLength);
+ temp = temp.substr(alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minGroup = it->second;
+ minPos = 0;
+ for(int i=alnLength-1;i>=0;i--){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > bdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
+ else{ //use the best match
+ group = minGroup;
+ seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripRBarcode");
+ exit(1);
+ }
+
}
//********************************************************************/
int TrimOligos::stripForward(Sequence& seq, int& group){
public:
TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
- TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
+ TrimOligos(int,int, int, int, map<string, int>, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
+ TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
~TrimOligos();
int stripBarcode(Sequence&, int&);
int stripBarcode(Sequence&, QualityScores&, int&);
+
+ int stripRBarcode(Sequence&, int&);
+ int stripRBarcode(Sequence&, QualityScores&, int&);
int stripForward(Sequence&, int&);
int stripForward(Sequence&, QualityScores&, int&, bool);
int pdiffs, bdiffs, ldiffs, sdiffs;
map<string, int> barcodes;
+ map<string, int> rbarcodes;
map<string, int> primers;
vector<string> revPrimer;
vector<string> linker;
int count = 0;
bool moreSeqs = 1;
- TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
+ TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
while (moreSeqs) {
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
+
+ if(rbarcodes.size() != 0){
+ success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+ if(success > bdiffs) { trashCode += 'b'; }
+ else{ currentSeqsDiffs += success; }
+ }
if(numSpacers != 0){
success = trimOligos.stripSpacer(currSeq, currQual);
tempPrimerQualFileNames,
tempNameFileNames,
lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
- pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
+ pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
int startIndex = i * numSeqsPerProcessor;
if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
- cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
}
}
}
else if(type == "BARCODE"){
inOligos >> group;
+
+ //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
+ //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
+ string temp = "";
+ while (!inOligos.eof()) {
+ char c = inOligos.get();
+ if (c == 10 || c == 13){ break; }
+ else if (c == 32 || c == 9){;} //space or tab
+ else { temp += c; }
+ }
+ //then this is illumina data with 4 columns
+ if (temp != "") {
+ string reverseBarcode = reverseOligo(group); //reverse barcode
+ group = temp;
+
+ //check for repeat barcodes
+ map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
+ if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); }
+
+ rbarcodes[reverseBarcode]=indexBarcode;
+ }
+
//check for repeat barcodes
map<string, int>::iterator itBar = barcodes.find(oligo);
if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
vector<string> revPrimer, outputNames;
set<string> filesToRemove;
map<string, int> barcodes;
+ map<string, int> rbarcodes;
vector<string> groupVector;
map<string, int> primers;
vector<string> linker;
double qRollAverage, qThreshold, qWindowAverage, qAverage;
vector<string> revPrimer;
map<string, int> barcodes;
+ map<string, int> rbarcodes;
map<string, int> primers;
vector<string> linker;
vector<string> spacer;
trimData(){}
trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
- int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
+ int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa,
vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
sdiffs = sd;
tdiffs = td;
barcodes = bar;
+ rbarcodes = rbar;
primers = pri; numFPrimers = primers.size();
revPrimer = revP; numRPrimers = revPrimer.size();
linker = li; numLinkers = linker.size();
}
- TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
+ TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
pDataArray->count = pDataArray->lineEnd;
for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
if(success > pDataArray->bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
-
+
+ if(pDataArray->rbarcodes.size() != 0){
+ success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+ if(success > pDataArray->bdiffs) { trashCode += 'b'; }
+ else{ currentSeqsDiffs += success; }
+ }
+
if(pDataArray->numSpacers != 0){
success = trimOligos.stripSpacer(currSeq, currQual);
if(success > pDataArray->sdiffs) { trashCode += 's'; }