numSeqs = templateSequences.size();
if (!silent) { m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); }
-
+
}else {
int start = time(NULL);
m->mothurOutEndLine();
m->mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush();
-
+ bool aligned = false;
+ int tempLength = 0;
+
#ifdef USE_MPI
int pid, processors;
vector<unsigned long long> positions;
//save longest base
if (temp.getUnaligned().length() >= longest) { longest = temp.getUnaligned().length()+1; }
+ if (tempLength != 0) {
+ if (tempLength != temp.getAligned().length()) { m->mothurOut("[ERROR]: template is not aligned, aborting.\n"); m->control_pressed=true; }
+ }else { tempLength = temp.getAligned().length(); }
}
}
//save longest base
if (temp.getUnaligned().length() >= longest) { longest = (temp.getUnaligned().length()+1); }
+
+ if (tempLength != 0) {
+ if (tempLength != temp.getAligned().length()) { m->mothurOut("[ERROR]: template is not aligned, aborting.\n"); m->control_pressed=true; }
+ }else { tempLength = temp.getAligned().length(); }
}
}
fastaFile.close();
*
*/
-#include "alignNode.h"
+#include "alignnode.h"
#include "taxonomynode.h"
#include "bayesian.h"
#include "amovacommand.h"
#include "readphylipvector.h"
#include "groupmap.h"
+#include "sharedutilities.h"
//**********************************************************************************************************************
vector<string> AmovaCommand::setParameters(){
try {
CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign);
+ CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets);
CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pphylip);
CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
CommandParameter palpha("alpha", "Number", "", "0.05", "", "", "",false,false); parameters.push_back(palpha);
string helpString = "";
helpString += "Referenced: Anderson MJ (2001). A new method for non-parametric multivariate analysis of variance. Austral Ecol 26: 32-46.";
helpString += "The amova command outputs a .amova file.";
- helpString += "The amova command parameters are phylip, iters, and alpha. The phylip and design parameters are required, unless you have valid current files.";
+ helpString += "The amova command parameters are phylip, iters, sets and alpha. The phylip and design parameters are required, unless you have valid current files.";
helpString += "The design parameter allows you to assign your samples to groups when you are running amova. It is required.";
helpString += "The design file looks like the group file. It is a 2 column tab delimited file, where the first column is the sample name and the second column is the group the sample belongs to.";
+ helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n";
helpString += "The iters parameter allows you to set number of randomization for the P value. The default is 1000.";
helpString += "The amova command should be in the following format: amova(phylip=file.dist, design=file.design).";
helpString += "Note: No spaces between parameter labels (i.e. iters), '=' and parameters (i.e. 1000).";
temp = validParameter.validFile(parameters, "alpha", false);
if (temp == "not found") { temp = "0.05"; }
m->mothurConvert(temp, experimentwiseAlpha);
+
+ string sets = validParameter.validFile(parameters, "sets", false);
+ if (sets == "not found") { sets = ""; }
+ else {
+ m->splitAtDash(sets, Sets);
+ }
}
}
catch(exception& e) {
//read in distance matrix and square it
ReadPhylipVector readMatrix(phylipFileName);
vector<string> sampleNames = readMatrix.read(distanceMatrix);
-
+
+ if (Sets.size() != 0) { //user selected sets, so we want to remove the samples not in those sets
+ SharedUtil util;
+ vector<string> dGroups = designMap->getNamesOfGroups();
+ util.setGroups(Sets, dGroups);
+
+ for(int i=0;i<distanceMatrix.size();i++){
+
+ if (m->control_pressed) { delete designMap; return 0; }
+
+ string group = designMap->getGroup(sampleNames[i]);
+
+ if (group == "not found") {
+ m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }else if (!m->inUsersGroups(group, Sets)){ //not in set we want remove it
+ //remove from all other rows
+ for(int j=0;j<distanceMatrix.size();j++){
+ distanceMatrix[j].erase(distanceMatrix[j].begin()+i);
+ }
+ distanceMatrix.erase(distanceMatrix.begin()+i);
+ sampleNames.erase(sampleNames.begin()+i);
+ i--;
+ }
+ }
+ }
+
for(int i=0;i<distanceMatrix.size();i++){
for(int j=0;j<i;j++){
distanceMatrix[i][j] *= distanceMatrix[i][j];
map<string, vector<int> > getRandomizedGroups(map<string, vector<int> >);
bool abort;
- vector<string> outputNames;
+ vector<string> outputNames, Sets;
string outputDir, inputDir, designFileName, phylipFileName;
GroupMap* designMap;
string name = "";
string chimeraFlag = "";
- in >> chimeraFlag >> name;
-
- //fix name if needed
- if (templatefile == "self") {
- name = name.substr(0, name.length()-1); //rip off last /
- name = name.substr(0, name.find_last_of('/'));
+ //in >> chimeraFlag >> name;
+
+ string line = m->getline(in);
+ vector<string> pieces = m->splitWhiteSpace(line);
+ if (pieces.size() > 2) {
+ name = pieces[1];
+ //fix name if needed
+ if (templatefile == "self") {
+ name = name.substr(0, name.length()-1); //rip off last /
+ name = name.substr(0, name.find_last_of('/'));
+ }
+
+ chimeraFlag = pieces[pieces.size()-1];
}
-
- for (int i = 0; i < 15; i++) { in >> chimeraFlag; }
+ //for (int i = 0; i < 15; i++) { in >> chimeraFlag; }
m->gobble(in);
if (chimeraFlag == "Y") { out << name << endl; numChimeras++; }
if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); }
taxFile = tfile;
- readTaxonomy(taxFile);
+
int numSeqs = 0;
if (tempFile == "saved") {
database->setNumSeqs(numSeqs);
- //sanity check
- bool okay = phyloTree->ErrorCheck(names);
-
- if (!okay) { m->control_pressed = true; }
-
m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine();
}else {
fastaFile.close();
}
#endif
-
+
database->setNumSeqs(names.size());
- //sanity check
- bool okay = phyloTree->ErrorCheck(names);
-
- if (!okay) { m->control_pressed = true; }
-
m->mothurOut("DONE."); m->mothurOutEndLine();
m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
}
-
+
+ readTaxonomy(taxFile);
+
+ //sanity check
+ bool okay = phyloTree->ErrorCheck(names);
+
+ if (!okay) { m->control_pressed = true; }
}
catch(exception& e) {
m->errorOut(e, "Classify", "generateDatabaseAndNames");
iss >> name; m->gobble(iss);
iss >> taxInfo;
if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); }
- taxonomy[name] = taxInfo;
- phyloTree->addSeqToTree(name, taxInfo);
- }
+ if (m->inUsersGroups(name, names)) {
+ taxonomy[name] = taxInfo;
+ phyloTree->addSeqToTree(name, taxInfo);
+ }else {
+ m->mothurOut("[WARNING]: " + name + " is in your taxonomy file and not in your reference file, ignoring.\n");
+ }
+ }
MPI_File_close(&inMPI);
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
taxonomy.clear();
m->readTax(file, taxonomy);
- for (map<string, string>::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) { phyloTree->addSeqToTree(itTax->first, itTax->second); }
+ map<string, string> tempTaxonomy;
+ for (map<string, string>::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) {
+ if (m->inUsersGroups(itTax->first, names)) {
+ phyloTree->addSeqToTree(itTax->first, itTax->second);
+ tempTaxonomy[itTax->first] = itTax->second;
+ }else {
+ m->mothurOut("[WARNING]: " + itTax->first + " is in your taxonomy file and not in your reference file, ignoring.\n");
+ }
+ }
+ taxonomy = tempTaxonomy;
#endif
phyloTree->assignHeirarchyIDs(0);
}
outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile);
- outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile);
outputNames.push_back(taxSummary); outputTypes["taxsummary"].push_back(taxSummary);
int start = time(NULL);
}
#endif
- if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); }
+ if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine();
+ outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile);
+ }else { m->mothurRemove(newaccnosFile); }
m->mothurOutEndLine();
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
#ifdef USE_MPI
}
#endif
-
- m->mothurOutEndLine();
- m->mothurOut("Output File Names: "); m->mothurOutEndLine();
- for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
- m->mothurOutEndLine();
}
+
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
//set taxonomy file as new current taxonomyfile
string current = "";
//make classify
Classify* myclassify;
+ string outputMethodTag = pDataArray->method + ".";
if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); }
else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); }
else if(pDataArray->method == "zap"){
- outputMethodTag = search + "_" + outputMethodTag;
+ outputMethodTag = pDataArray->search + "_" + outputMethodTag;
if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); }
else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); }
}
//zero out counts that don't make the cutoff
float percentage = (100.0 - cutoff) / 100.0;
- int zeroCutoff = percentage * size;
-
+
for (int i = 0; i < counts.size(); i++) {
- if (counts[i] < zeroCutoff) { counts[i] = 0; }
+ float countPercentage = counts[i] / (float) size;
+ if (countPercentage < percentage) { counts[i] = 0; }
}
//any
int nrows = numOTUS;//rows of inital matrix
int ncols = thisLookUp.size();//groups
double initscore = 0.0;
-
+
vector<double> stats;
- double probabilityMatrix[ncols * nrows];
+ vector<double> probabilityMatrix; probabilityMatrix.resize(ncols * nrows, 0);
vector<vector<int> > nullmatrix(nrows, vector<int>(ncols, 0));
-
+
TrialSwap2 trial;
int n = accumulate( columntotal.begin(), columntotal.end(), 0 );
m->openInputFile(metadatafile, in);
string headerLine = m->getline(in); m->gobble(in);
- istringstream iss (headerLine,istringstream::in);
-
- //read the first label, because it refers to the groups
- string columnLabel;
- iss >> columnLabel; m->gobble(iss);
+ vector<string> pieces = m->splitWhiteSpace(headerLine);
//save names of columns you are reading
- while (!iss.eof()) {
- iss >> columnLabel; m->gobble(iss);
- metadataLabels.push_back(columnLabel);
+ for (int i = 1; i < pieces.size(); i++) {
+ metadataLabels.push_back(pieces[i]);
}
int count = metadataLabels.size();
exit(1);
}
}
+/***********************************************************************/
+
+int FormatColumnMatrix::read(CountTable* nameMap){
+ try {
+
+ string firstName, secondName;
+ float distance;
+ int nseqs = nameMap->size();
+
+ list = new ListVector(nameMap->getListVector());
+
+ Progress* reading = new Progress("Formatting matrix: ", nseqs * nseqs);
+
+ int lt = 1;
+ int refRow = 0; //we'll keep track of one cell - Cell(refRow,refCol) - and see if it's transpose
+ int refCol = 0; //shows up later - Cell(refCol,refRow). If it does, then its a square matrix
+
+ //need to see if this is a square or a triangular matrix...
+
+ ofstream out;
+ string tempOutFile = filename + ".temp";
+ m->openOutputFile(tempOutFile, out);
+
+ while(fileHandle && lt == 1){ //let's assume it's a triangular matrix...
+
+ if (m->control_pressed) { out.close(); m->mothurRemove(tempOutFile); fileHandle.close(); delete reading; return 0; }
+
+ fileHandle >> firstName >> secondName >> distance; // get the row and column names and distance
+
+ int itA = nameMap->get(firstName);
+ int itB = nameMap->get(secondName);
+
+ if (distance == -1) { distance = 1000000; }
+
+ if((distance < cutoff) && (itA != itB)){
+ if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol...
+ refRow = itA;
+ refCol = itB;
+
+ //making it square
+ out << itA << '\t' << itB << '\t' << distance << endl;
+ out << itB << '\t' << itA << '\t' << distance << endl;
+ }
+ else if(refRow == itA && refCol == itB){ lt = 0; } //you are square
+ else if(refRow == itB && refCol == itA){ lt = 0; } //you are square
+ else{ //making it square
+ out << itA << '\t' << itB << '\t' << distance << endl;
+ out << itB << '\t' << itA << '\t' << distance << endl;
+ }
+
+ reading->update(itA * nseqs / 2);
+ }
+ m->gobble(fileHandle);
+ }
+ out.close();
+ fileHandle.close();
+
+ string squareFile;
+ if(lt == 0){ // oops, it was square
+ squareFile = filename;
+ }else{ squareFile = tempOutFile; }
+
+ //sort file by first column so the distances for each row are together
+ string outfile = m->getRootName(squareFile) + "sorted.dist.temp";
+
+ //use the unix sort
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ string command = "sort -n " + squareFile + " -o " + outfile;
+ system(command.c_str());
+#else //sort using windows sort
+ string command = "sort " + squareFile + " /O " + outfile;
+ system(command.c_str());
+#endif
+
+ if (m->control_pressed) { m->mothurRemove(tempOutFile); m->mothurRemove(outfile); delete reading; return 0; }
+
+ //output to new file distance for each row and save positions in file where new row begins
+ ifstream in;
+ m->openInputFile(outfile, in);
+
+ distFile = outfile + ".rowFormatted";
+ m->openOutputFile(distFile, out);
+
+ rowPos.resize(nseqs, -1);
+ int currentRow;
+ int first, second;
+ float dist;
+ map<int, float> rowMap;
+ map<int, float>::iterator itRow;
+
+ //get first currentRow
+ in >> first;
+ currentRow = first;
+
+ string firstString = toString(first);
+ for(int k = 0; k < firstString.length(); k++) { in.putback(firstString[k]); }
+
+ while(!in.eof()) {
+
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(distFile); m->mothurRemove(tempOutFile); m->mothurRemove(outfile); delete reading; return 0; }
+
+ in >> first >> second >> dist; m->gobble(in);
+
+ if (first != currentRow) {
+ //save position in file of each new row
+ rowPos[currentRow] = out.tellp();
+
+ out << currentRow << '\t' << rowMap.size() << '\t';
+
+ for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+ out << itRow->first << '\t' << itRow->second << '\t';
+ }
+ out << endl;
+
+ currentRow = first;
+ rowMap.clear();
+
+ //save row you just read
+ if (dist < cutoff) {
+ rowMap[second] = dist;
+ }
+ }else{
+ if (dist < cutoff) {
+ rowMap[second] = dist;
+ }
+ }
+ }
+
+ //print last Row
+ //save position in file of each new row
+ rowPos[currentRow] = out.tellp();
+
+ out << currentRow << '\t' << rowMap.size() << '\t';
+
+ for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+ out << itRow->first << '\t' << itRow->second << '\t';
+ }
+ out << endl;
+
+
+ in.close();
+ out.close();
+
+ if (m->control_pressed) { m->mothurRemove(distFile); m->mothurRemove(tempOutFile); m->mothurRemove(outfile); delete reading; return 0; }
+
+ m->mothurRemove(tempOutFile);
+ m->mothurRemove(outfile);
+
+ reading->finish();
+
+ delete reading;
+ list->setLabel("0");
+
+ if (m->control_pressed) { m->mothurRemove(distFile); return 0; }
+
+ return 1;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "FormatColumnMatrix", "read");
+ exit(1);
+ }
+}
/***********************************************************************/
FormatColumnMatrix::~FormatColumnMatrix(){}
FormatColumnMatrix(string);
~FormatColumnMatrix();
int read(NameAssignment*);
+ int read(CountTable*);
private:
ifstream fileHandle;
#include "mothur.h"
#include "listvector.hpp"
#include "nameassignment.hpp"
+#include "counttable.h"
//**********************************************************************************************************************
virtual ~FormatMatrix() {}
virtual int read(NameAssignment*){ return 1; }
+ virtual int read(CountTable*){ return 1; }
void setCutoff(float c) { cutoff = c; }
ListVector* getListVector() { return list; }
if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
else { convert(numTest, nseqs); }
-
- list = new ListVector(nseqs);
- list->set(0, name);
+ if(nameMap == NULL){
+ list = new ListVector(nseqs);
+ list->set(0, name);
+ }
+ else{
+ list = new ListVector(nameMap->getListVector());
+ if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
+ }
char d;
while((d=fileHandle.get()) != EOF){
fileHandle >> name;
- list->set(i, name);
+ if(nameMap == NULL){ list->set(i, name); }
+ else { if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
+ }
for(int j=0;j<i;j++){
for(int i=0;i<nseqs;i++){
fileHandle >> name;
- list->set(i, name);
+ if(nameMap == NULL){ list->set(i, name); }
+ else { if(nameMap->count(name)==0){ m->mothurOut("Error: Sequence '" + name + "' was not found in the names file, please correct"); m->mothurOutEndLine(); }
+ }
for(int j=0;j<nseqs;j++){
if (m->control_pressed) { fileHandle.close(); out.close(); m->mothurRemove(distFile); delete reading; return 0; }
exit(1);
}
}
+/***********************************************************************/
+//not using nameMap
+int FormatPhylipMatrix::read(CountTable* nameMap){
+ try {
+
+ float distance;
+ int square, nseqs;
+ string name;
+ ofstream out;
+
+ string numTest;
+ fileHandle >> numTest >> name;
+
+ if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+ else { convert(numTest, nseqs); }
+
+ if(nameMap == NULL){
+ list = new ListVector(nseqs);
+ list->set(0, name);
+ }
+ else{
+ list = new ListVector(nameMap->getListVector());
+ nameMap->get(name);
+ }
+
+ char d;
+ while((d=fileHandle.get()) != EOF){
+
+ if(isalnum(d)){ //you are square
+ square = 1;
+ fileHandle.close(); //reset file
+
+ //open and get through numSeqs, code below formats rest of file
+ m->openInputFile(filename, fileHandle);
+ fileHandle >> nseqs; m->gobble(fileHandle);
+
+ distFile = filename + ".rowFormatted";
+ m->openOutputFile(distFile, out);
+ break;
+ }
+ if(d == '\n'){
+ square = 0;
+ break;
+ }
+ }
+
+ Progress* reading;
+ reading = new Progress("Formatting matrix: ", nseqs * nseqs);
+
+ //lower triangle, so must go to column then formatted row file
+ if(square == 0){
+ int index = 0;
+
+ ofstream outTemp;
+ string tempFile = filename + ".temp";
+ m->openOutputFile(tempFile, outTemp);
+
+ //convert to square column matrix
+ for(int i=1;i<nseqs;i++){
+
+ fileHandle >> name;
+
+ if(nameMap == NULL){ list->set(i, name); }
+ else { nameMap->get(name); }
+
+
+ for(int j=0;j<i;j++){
+
+ if (m->control_pressed) { outTemp.close(); m->mothurRemove(tempFile); fileHandle.close(); delete reading; return 0; }
+
+ fileHandle >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if(distance < cutoff){
+ outTemp << i << '\t' << j << '\t' << distance << endl;
+ outTemp << j << '\t' << i << '\t' << distance << endl;
+ }
+ index++;
+ reading->update(index);
+ }
+ }
+ outTemp.close();
+
+ //format from square column to rowFormatted
+ //sort file by first column so the distances for each row are together
+ string outfile = m->getRootName(tempFile) + "sorted.dist.temp";
+
+ //use the unix sort
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ string command = "sort -n " + tempFile + " -o " + outfile;
+ system(command.c_str());
+#else //sort using windows sort
+ string command = "sort " + tempFile + " /O " + outfile;
+ system(command.c_str());
+#endif
+
+ if (m->control_pressed) { m->mothurRemove(tempFile); m->mothurRemove(outfile); delete reading; return 0; }
+
+ //output to new file distance for each row and save positions in file where new row begins
+ ifstream in;
+ m->openInputFile(outfile, in);
+
+ distFile = outfile + ".rowFormatted";
+ m->openOutputFile(distFile, out);
+
+ rowPos.resize(nseqs, -1);
+ int currentRow;
+ int first, second;
+ float dist;
+ map<int, float> rowMap;
+ map<int, float>::iterator itRow;
+
+ //get first currentRow
+ in >> first;
+ currentRow = first;
+
+ string firstString = toString(first);
+ for(int k = 0; k < firstString.length(); k++) { in.putback(firstString[k]); }
+
+ while(!in.eof()) {
+ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); m->mothurRemove(distFile); m->mothurRemove(outfile); delete reading; return 0; }
+
+ in >> first >> second >> dist; m->gobble(in);
+
+ if (first != currentRow) {
+ //save position in file of each new row
+ rowPos[currentRow] = out.tellp();
+
+ out << currentRow << '\t' << rowMap.size() << '\t';
+
+ for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+ out << itRow->first << '\t' << itRow->second << '\t';
+ }
+ out << endl;
+
+ currentRow = first;
+ rowMap.clear();
+
+ //save row you just read
+ rowMap[second] = dist;
+
+ index++;
+ reading->update(index);
+ }else{
+ rowMap[second] = dist;
+ }
+ }
+
+ //print last Row
+ //save position in file of each new row
+ rowPos[currentRow] = out.tellp();
+
+ out << currentRow << '\t' << rowMap.size() << '\t';
+
+ for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+ out << itRow->first << '\t' << itRow->second << '\t';
+ }
+ out << endl;
+
+ in.close();
+ out.close();
+
+ m->mothurRemove(tempFile);
+ m->mothurRemove(outfile);
+
+ if (m->control_pressed) { m->mothurRemove(distFile); delete reading; return 0; }
+
+ }
+ else{ //square matrix convert directly to formatted row file
+ int index = nseqs;
+ map<int, float> rowMap;
+ map<int, float>::iterator itRow;
+ rowPos.resize(nseqs, -1);
+
+ for(int i=0;i<nseqs;i++){
+ fileHandle >> name;
+
+ if(nameMap == NULL){ list->set(i, name); }
+ else { nameMap->get(name); }
+
+ for(int j=0;j<nseqs;j++){
+ if (m->control_pressed) { fileHandle.close(); out.close(); m->mothurRemove(distFile); delete reading; return 0; }
+
+ fileHandle >> distance;
+
+ if (distance == -1) { distance = 1000000; }
+
+ if((distance < cutoff) && (j != i)){
+ rowMap[j] = distance;
+ }
+ index++;
+ reading->update(index);
+ }
+
+ m->gobble(fileHandle);
+
+ //save position in file of each new row
+ rowPos[i] = out.tellp();
+
+ //output row to file
+ out << i << '\t' << rowMap.size() << '\t';
+ for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+ out << itRow->first << '\t' << itRow->second << '\t';
+ }
+ out << endl;
+
+ //clear map for new row's info
+ rowMap.clear();
+ }
+ }
+ reading->finish();
+ delete reading;
+ fileHandle.close();
+ out.close();
+
+ if (m->control_pressed) { m->mothurRemove(distFile); return 0; }
+
+ list->setLabel("0");
+
+ return 1;
+
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "FormatPhylipMatrix", "read");
+ exit(1);
+ }
+}
+
/***********************************************************************/
FormatPhylipMatrix::~FormatPhylipMatrix(){}
/***********************************************************************/
FormatPhylipMatrix(string);
~FormatPhylipMatrix();
int read(NameAssignment*);
+ int read(CountTable*);
+
private:
ifstream fileHandle;
string filename;
readMatrix->read(nameMap);
}else if (countfile != "") {
readMatrix->read(&ct);
+ }else {
+ readMatrix->read(nameMap);
}
if (m->control_pressed) { delete readMatrix; return 0; }
if(namefile != ""){
nameMap = new NameAssignment(namefile);
nameMap->readMap();
- readMatrix->read(nameMap);
+ formatMatrix->read(nameMap);
}else if (countfile != "") {
- readMatrix->read(&ct);
+ formatMatrix->read(&ct);
+ }else {
+ formatMatrix->read(nameMap);
}
if (m->control_pressed) { delete formatMatrix; return 0; }
//check for required parameters
listfile = validParameter.validFile(parameters, "list", true);
- if (listfile == "not open") { listfile = ""; abort = true; }
+ if (listfile == "not open") { abort = true; }
else if (listfile == "not found") { listfile = ""; }
else { format = "list"; inputfile = listfile; m->setListFile(listfile); }
sabundfile = validParameter.validFile(parameters, "sabund", true);
- if (sabundfile == "not open") { sabundfile = ""; abort = true; }
+ if (sabundfile == "not open") { abort = true; }
else if (sabundfile == "not found") { sabundfile = ""; }
else { format = "sabund"; inputfile = sabundfile; m->setSabundFile(sabundfile); }
rabundfile = validParameter.validFile(parameters, "rabund", true);
- if (rabundfile == "not open") { rabundfile = ""; abort = true; }
+ if (rabundfile == "not open") { abort = true; }
else if (rabundfile == "not found") { rabundfile = ""; }
else { format = "rabund"; inputfile = rabundfile; m->setRabundFile(rabundfile); }
sharedfile = validParameter.validFile(parameters, "shared", true);
- if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+ if (sharedfile == "not open") { abort = true; }
else if (sharedfile == "not found") { sharedfile = ""; }
else { format = "sharedfile"; inputfile = sharedfile; m->setSharedFile(sharedfile); }
relabundfile = validParameter.validFile(parameters, "relabund", true);
- if (relabundfile == "not open") { relabundfile = ""; abort = true; }
+ if (relabundfile == "not open") { abort = true; }
else if (relabundfile == "not found") { relabundfile = ""; }
else { format = "relabund"; inputfile = relabundfile; m->setRelAbundFile(relabundfile); }
#include "homovacommand.h"
#include "groupmap.h"
#include "readphylipvector.h"
+#include "sharedutilities.h"
//**********************************************************************************************************************
vector<string> HomovaCommand::setParameters(){
try {
CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign);
CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pphylip);
+ CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets);
CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
CommandParameter palpha("alpha", "Number", "", "0.05", "", "", "",false,false); parameters.push_back(palpha);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
string helpString = "";
helpString += "Referenced: Stewart CN, Excoffier L (1996). Assessing population genetic structure and variability with RAPD data: Application to Vaccinium macrocarpon (American Cranberry). J Evol Biol 9: 153-71.\n";
helpString += "The homova command outputs a .homova file. \n";
- helpString += "The homova command parameters are phylip, iters, and alpha. The phylip and design parameters are required, unless valid current files exist.\n";
+ helpString += "The homova command parameters are phylip, iters, sets and alpha. The phylip and design parameters are required, unless valid current files exist.\n";
helpString += "The design parameter allows you to assign your samples to groups when you are running homova. It is required. \n";
helpString += "The design file looks like the group file. It is a 2 column tab delimited file, where the first column is the sample name and the second column is the group the sample belongs to.\n";
+ helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n";
helpString += "The iters parameter allows you to set number of randomization for the P value. The default is 1000. \n";
helpString += "The homova command should be in the following format: homova(phylip=file.dist, design=file.design).\n";
helpString += "Note: No spaces between parameter labels (i.e. iters), '=' and parameters (i.e. 1000).\n";
temp = validParameter.validFile(parameters, "alpha", false);
if (temp == "not found") { temp = "0.05"; }
m->mothurConvert(temp, experimentwiseAlpha);
+
+ string sets = validParameter.validFile(parameters, "sets", false);
+ if (sets == "not found") { sets = ""; }
+ else {
+ m->splitAtDash(sets, Sets);
+ }
}
}
ReadPhylipVector readMatrix(phylipFileName);
vector<string> sampleNames = readMatrix.read(distanceMatrix);
+ if (Sets.size() != 0) { //user selected sets, so we want to remove the samples not in those sets
+ SharedUtil util;
+ vector<string> dGroups = designMap->getNamesOfGroups();
+ util.setGroups(Sets, dGroups);
+
+ for(int i=0;i<distanceMatrix.size();i++){
+
+ if (m->control_pressed) { delete designMap; return 0; }
+
+ string group = designMap->getGroup(sampleNames[i]);
+
+ if (group == "not found") {
+ m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }else if (!m->inUsersGroups(group, Sets)){ //not in set we want remove it
+ //remove from all other rows
+ for(int j=0;j<distanceMatrix.size();j++){
+ distanceMatrix[j].erase(distanceMatrix[j].begin()+i);
+ }
+ distanceMatrix.erase(distanceMatrix.begin()+i);
+ sampleNames.erase(sampleNames.begin()+i);
+ i--;
+ }
+ }
+ }
+
for(int i=0;i<distanceMatrix.size();i++){
for(int j=0;j<i;j++){
distanceMatrix[i][j] *= distanceMatrix[i][j];
map<string, vector<int> > getRandomizedGroups(map<string, vector<int> >);
bool abort;
- vector<string> outputNames;
+ vector<string> outputNames, Sets;
string outputDir, inputDir, designFileName, phylipFileName;
GroupMap* designMap;
#include "makebiomcommand.h"
#include "sharedrabundvector.h"
#include "inputdata.h"
+#include "sharedutilities.h"
//taken from http://biom-format.org/documentation/biom_format.html
/* Minimal Sparse
try {
CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcontaxonomy);
+ CommandParameter pmetadata("metadata", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pmetadata);
CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
string MakeBiomCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The make.biom command parameters are shared, contaxonomy, groups, matrixtype and label. shared is required, unless you have a valid current file.\n";
+ helpString += "The make.biom command parameters are shared, contaxonomy, metadata, groups, matrixtype and label. shared is required, unless you have a valid current file.\n";
helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n";
- helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance. ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled.\n";
+ helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance. ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled. It is used to assign taxonomy information to the metadata of rows.\n";
+ helpString += "The metadata parameter is used to provide experimental parameters to the columns. Things like 'sample1 gut human_gut'. \n";
helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n";
helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n";
helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["contaxonomy"] = inputDir + it->second; }
}
+
+ it = parameters.find("metadata");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["metadata"] = inputDir + it->second; }
+ }
}
//get shared file
if (contaxonomyfile == "not found") { contaxonomyfile = ""; }
else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
+ metadatafile = validParameter.validFile(parameters, "metadata", true);
+ if (metadatafile == "not found") { metadatafile = ""; }
+ else if (metadatafile == "not open") { metadatafile = ""; abort = true; }
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
labels.insert(lastLabel);
}
+ getSampleMetaData(lookup);
+
//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
set<string> processedLabels;
set<string> userLabels = labels;
{"id":"Sample6", "metadata":null}
],*/
- string colBack = "\", \"metadata\":null}";
+ string colBack = "\", \"metadata\":";
out << spaces + "\"columns\":[\n";
for (int i = 0; i < lookup.size()-1; i++) {
if (m->control_pressed) { out.close(); return 0; }
- out << rowFront << lookup[i]->getGroup() << colBack << ",\n";
+ out << rowFront << lookup[i]->getGroup() << colBack << sampleMetadata[i] << "},\n";
}
- out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << "\n" + spaces + "],\n";
+ out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << sampleMetadata[lookup.size()-1] << "}\n" + spaces + "],\n";
out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n";
out << spaces + "\"shape\": [" << m->currentBinLabels.size() << "," << lookup.size() << "],\n";
}
}
+//**********************************************************************************************************************
+int MakeBiomCommand::getSampleMetaData(vector<SharedRAbundVector*>& lookup){
+ try {
+
+ if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) { sampleMetadata.push_back("null"); } }
+ else {
+ ifstream in;
+ m->openInputFile(metadatafile, in);
+
+ vector<string> groupNames, metadataLabels;
+ map<string, vector<string> > lines;
+
+ string headerLine = m->getline(in); m->gobble(in);
+ vector<string> pieces = m->splitWhiteSpace(headerLine);
+
+ //save names of columns you are reading
+ for (int i = 1; i < pieces.size(); i++) {
+ metadataLabels.push_back(pieces[i]);
+ }
+ int count = metadataLabels.size();
+
+ vector<string> groups = m->getGroups();
+
+ //read rest of file
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); return 0; }
+
+ string group = "";
+ in >> group; m->gobble(in);
+ groupNames.push_back(group);
+
+ string line = m->getline(in); m->gobble(in);
+ vector<string> thisPieces = m->splitWhiteSpaceWithQuotes(line);
+
+ if (thisPieces.size() != count) { m->mothurOut("[ERROR]: expected " + toString(count) + " items of data for sample " + group + " read " + toString(thisPieces.size()) + ", quitting.\n"); }
+ else { if (m->inUsersGroups(group, groups)) { lines[group] = thisPieces; } }
+
+ m->gobble(in);
+ }
+ in.close();
+
+ map<string, vector<string> >::iterator it;
+ for (int i = 0; i < lookup.size(); i++) {
+
+ if (m->control_pressed) { return 0; }
+
+ it = lines.find(lookup[i]->getGroup());
+
+ if (it == lines.end()) { m->mothurOut("[ERROR]: can't find metadata information for " + lookup[i]->getGroup() + ", quitting.\n"); m->control_pressed = true; }
+ else {
+ vector<string> values = it->second;
+
+ string data = "{";
+ for (int j = 0; j < metadataLabels.size()-1; j++) {
+ values[j] = m->removeQuotes(values[j]);
+ data += "\"" + metadataLabels[j] + "\":\"" + values[j] + "\", ";
+ }
+ values[metadataLabels.size()-1] = m->removeQuotes(values[metadataLabels.size()-1]);
+ data += "\"" + metadataLabels[metadataLabels.size()-1] + "\":\"" + values[metadataLabels.size()-1] + "\"}";
+ sampleMetadata.push_back(data);
+ }
+ }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "MakeBiomCommand", "getSampleMetaData");
+ exit(1);
+ }
+
+}
+
/**************************************************************************************************/
//returns {Bacteria, Bacteroidetes, ..} and scores is filled with {100, 98, ...} or {null, null, null}
vector<string> MakeBiomCommand::parseTax(string tax, vector<string>& scores) {
private:
- string sharedfile, contaxonomyfile, groups, outputDir, format, label;
- vector<string> outputNames, Groups;
+ string sharedfile, contaxonomyfile, metadatafile, groups, outputDir, format, label;
+ vector<string> outputNames, Groups, sampleMetadata;
set<string> labels;
bool abort, allLines;
int getBiom(vector<SharedRAbundVector*>&);
vector<string> getMetaData(vector<SharedRAbundVector*>&);
vector<string> parseTax(string tax, vector<string>& scores);
+ int getSampleMetaData(vector<SharedRAbundVector*>&);
};
CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
- CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
- CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
+// CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
+// CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "",false,false); parameters.push_back(palign);
temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
m->mothurConvert(temp, pdiffs);
- temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
- m->mothurConvert(temp, ldiffs);
+ // temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
+// m->mothurConvert(temp, ldiffs);
- temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
- m->mothurConvert(temp, sdiffs);
+ // temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
+// m->mothurConvert(temp, sdiffs);
temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
m->mothurConvert(temp, tdiffs);
- if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
+ if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } //+ ldiffs + sdiffs;
temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
allFiles = m->isTrue(temp);
if (m->control_pressed) { return 0; }
- string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("fasta");
- string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("qfile");
+ vector<vector<string> > fastaFileNames;
+ vector<vector<string> > qualFileNames;
+ createGroup = false;
+ string outputGroupFileName;
+ if(oligosfile != ""){
+ createGroup = getOligos(fastaFileNames, qualFileNames);
+ if (createGroup) {
+ outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("group");
+ outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
+ }
+ }
+
+ string outFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "trim." + getOutputFileNameTag("fasta");
+ string outQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "trim." + getOutputFileNameTag("qfile");
+ string outScrapFastaFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "scrap." + getOutputFileNameTag("fasta");
+ string outScrapQualFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + "scrap." + getOutputFileNameTag("qfile");
+
string outMisMatchFile = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + getOutputFileNameTag("mismatch");
outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
outputNames.push_back(outQualFile); outputTypes["qfile"].push_back(outQualFile);
+ outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile);
+ outputNames.push_back(outScrapQualFile); outputTypes["qfile"].push_back(outScrapQualFile);
outputNames.push_back(outMisMatchFile); outputTypes["mismatch"].push_back(outMisMatchFile);
m->mothurOut("Making contigs...\n");
- createProcesses(files, outFastaFile, outQualFile, outMisMatchFile);
+ createProcesses(files, outFastaFile, outQualFile, outScrapFastaFile, outScrapQualFile, outMisMatchFile, fastaFileNames, qualFileNames);
m->mothurOut("Done.\n");
//remove temp fasta and qual files
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+ if(allFiles){
+ map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
+ map<string, string>::iterator it;
+ set<string> namesToRemove;
+ for(int i=0;i<fastaFileNames.size();i++){
+ for(int j=0;j<fastaFileNames[0].size();j++){
+ if (fastaFileNames[i][j] != "") {
+ if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
+ if(m->isBlank(fastaFileNames[i][j])){
+ m->mothurRemove(fastaFileNames[i][j]);
+ namesToRemove.insert(fastaFileNames[i][j]);
+
+ m->mothurRemove(qualFileNames[i][j]);
+ namesToRemove.insert(qualFileNames[i][j]);
+ }else{
+ it = uniqueFastaNames.find(fastaFileNames[i][j]);
+ if (it == uniqueFastaNames.end()) {
+ uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
+ }
+ }
+ }
+ }
+ }
+ }
+
+ //remove names for outputFileNames, just cleans up the output
+ vector<string> outputNames2;
+ for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
+ outputNames = outputNames2;
+
+ for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
+ ifstream in;
+ m->openInputFile(it->first, in);
+
+ ofstream out;
+ string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first));
+ thisGroupName += getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
+ m->openOutputFile(thisGroupName, out);
+
+ while (!in.eof()){
+ if (m->control_pressed) { break; }
+
+ Sequence currSeq(in); m->gobble(in);
+ out << currSeq.getName() << '\t' << it->second << endl;
+ }
+ in.close();
+ out.close();
+ }
+ }
+
+ if (createGroup) {
+ ofstream outGroup;
+ m->openOutputFile(outputGroupFileName, outGroup);
+ for (map<string, string>::iterator itGroup = groupMap.begin(); itGroup != groupMap.end(); itGroup++) {
+ outGroup << itGroup->first << '\t' << itGroup->second << endl;
+ }
+ outGroup.close();
+ }
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numReads) + " sequences.\n");
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ //output group counts
+ m->mothurOutEndLine();
+ int total = 0;
+ if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
+ for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
+ total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
+ }
+ if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
string currentFasta = "";
itTypes = outputTypes.find("fasta");
if (itTypes != outputTypes.end()) {
}
}
//**********************************************************************************************************************
-int MakeContigsCommand::createProcesses(vector< vector<string> > files, string outputFasta, string outputQual, string outputMisMatches) {
+int MakeContigsCommand::createProcesses(vector< vector<string> > files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
try {
int num = 0;
vector<int> processIDS;
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
- num = driver(files[process], outputFasta + toString(getpid()) + ".temp", outputQual + toString(getpid()) + ".temp", outputMisMatches + toString(getpid()) + ".temp");
+ vector<vector<string> > tempFASTAFileNames = fastaFileNames;
+ vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
+
+ if(allFiles){
+ ofstream temp;
+
+ for(int i=0;i<tempFASTAFileNames.size();i++){
+ for(int j=0;j<tempFASTAFileNames[i].size();j++){
+ if (tempFASTAFileNames[i][j] != "") {
+ tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
+ m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
+
+ tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
+ m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
+ }
+ }
+ }
+ }
+
+ num = driver(files[process],
+ outputFasta + toString(getpid()) + ".temp",
+ outputQual + toString(getpid()) + ".temp",
+ outputScrapFasta + toString(getpid()) + ".temp",
+ outputScrapQual + toString(getpid()) + ".temp",
+ outputMisMatches + toString(getpid()) + ".temp",
+ tempFASTAFileNames,
+ tempPrimerQualFileNames);
- //pass numSeqs to parent
- ofstream out;
- string tempFile = outputFasta + toString(getpid()) + ".num.temp";
- m->openOutputFile(tempFile, out);
- out << num << endl;
- out.close();
+ //pass groupCounts to parent
+ ofstream out;
+ string tempFile = toString(getpid()) + ".num.temp";
+ m->openOutputFile(tempFile, out);
+ out << num << endl;
+ if(createGroup){
+ out << groupCounts.size() << endl;
+
+ for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
+ out << it->first << '\t' << it->second << endl;
+ }
+
+ out << groupMap.size() << endl;
+ for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
+ out << it->first << '\t' << it->second << endl;
+ }
+ }
+ out.close();
exit(0);
}else {
}
}
+ ofstream temp;
+ m->openOutputFile(outputFasta, temp); temp.close();
+ m->openOutputFile(outputQual, temp); temp.close();
+ m->openOutputFile(outputScrapFasta, temp); temp.close();
+ m->openOutputFile(outputScrapQual, temp); temp.close();
+
//do my part
- num = driver(files[processors-1], outputFasta, outputQual, outputMisMatches);
+ num = driver(files[processors-1], outputFasta, outputQual, outputScrapFasta, outputScrapQual, outputMisMatches, fastaFileNames, qualFileNames);
//force parent to wait until all the processes are done
for (int i=0;i<processIDS.size();i++) {
}
for (int i = 0; i < processIDS.size(); i++) {
- ifstream in;
- string tempFile = outputFasta + toString(processIDS[i]) + ".num.temp";
- m->openInputFile(tempFile, in);
- if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
- in.close(); m->mothurRemove(tempFile);
+ ifstream in;
+ string tempFile = toString(processIDS[i]) + ".num.temp";
+ m->openInputFile(tempFile, in);
+ int tempNum;
+ in >> tempNum; num += tempNum; m->gobble(in);
+
+ if(createGroup){
+ string group;
+ in >> tempNum; m->gobble(in);
+
+ if (tempNum != 0) {
+ for (int j = 0; j < tempNum; j++) {
+ int groupNum;
+ in >> group >> groupNum; m->gobble(in);
+
+ map<string, int>::iterator it = groupCounts.find(group);
+ if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
+ else { groupCounts[it->first] += groupNum; }
+ }
+ }
+ in >> tempNum; m->gobble(in);
+ if (tempNum != 0) {
+ for (int j = 0; j < tempNum; j++) {
+ string group, seqName;
+ in >> seqName >> group; m->gobble(in);
+
+ map<string, string>::iterator it = groupMap.find(seqName);
+ if (it == groupMap.end()) { groupMap[seqName] = group; }
+ else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
+ }
+ }
+ }
+ in.close(); m->mothurRemove(tempFile);
}
#else
HANDLE hThreadArray[processors-1];
//Create processor worker threads.
- for( int i=0; i<processors-1; i++ ){
- string extension = toString(i) + ".temp";
-
- contigsData* tempcontig = new contigsData(files[i], (outputFasta + extension), (outputQual + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, threshold, i);
+ for( int h=0; h<processors-1; h++ ){
+ string extension = "";
+ if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
+ vector<vector<string> > tempFASTAFileNames = fastaFileNames;
+ vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
+
+ if(allFiles){
+ ofstream temp;
+
+ for(int i=0;i<tempFASTAFileNames.size();i++){
+ for(int j=0;j<tempFASTAFileNames[i].size();j++){
+ if (tempFASTAFileNames[i][j] != "") {
+ tempFASTAFileNames[i][j] += extension;
+ m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
+
+
+ tempPrimerQualFileNames[i][j] += extension;
+ m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
+ }
+ }
+ }
+ }
+
+
+ contigsData* tempcontig = new contigsData(files[h], (outputFasta + extension), (outputQual + extension), (outputScrapFasta + extension), (outputScrapQual + extension),(outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, threshold, barcodes, primers, tempFASTAFileNames, tempPrimerQualFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createGroup, allFiles, h);
pDataArray.push_back(tempcontig);
- processIDS.push_back(i);
- hThreadArray[i] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ hThreadArray[h] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
}
+
+ vector<vector<string> > tempFASTAFileNames = fastaFileNames;
+ vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
+
+ if(allFiles){
+ ofstream temp;
+ string extension = toString(processors-1) + ".temp";
+
+ for(int i=0;i<tempFASTAFileNames.size();i++){
+ for(int j=0;j<tempFASTAFileNames[i].size();j++){
+ if (tempFASTAFileNames[i][j] != "") {
+ tempFASTAFileNames[i][j] += extension;
+ m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
+
+
+ tempPrimerQualFileNames[i][j] += extension;
+ m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
+ }
+ }
+ }
+ }
+
+ //parent do my part
+ ofstream temp;
+ m->openOutputFile(outputFasta, temp); temp.close();
+ m->openOutputFile(outputQual, temp); temp.close();
+ m->openOutputFile(outputScrapFasta, temp); temp.close();
+ m->openOutputFile(outputScrapQual, temp); temp.close();
- num = driver(files[processors-1], outputFasta, outputQual, outputMisMatches);
+
+ //do my part
+ num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputQual+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputScrapQual+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames);
//Wait until all threads have terminated.
WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
+ map<string, int>::iterator it2 = groupCounts.find(it->first);
+ if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
+ else { groupCounts[it->first] += it->second; }
+ }
+ for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
+ map<string, string>::iterator it2 = groupMap.find(it->first);
+ if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
+ else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
+ }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
m->appendFiles((outputQual + toString(processIDS[i]) + ".temp"), outputQual);
m->mothurRemove((outputQual + toString(processIDS[i]) + ".temp"));
+ m->appendFiles((outputScrapFasta + toString(processIDS[i]) + ".temp"), outputScrapFasta);
+ m->mothurRemove((outputScrapFasta + toString(processIDS[i]) + ".temp"));
+
+ m->appendFiles((outputScrapQual + toString(processIDS[i]) + ".temp"), outputScrapQual);
+ m->mothurRemove((outputScrapQual + toString(processIDS[i]) + ".temp"));
+
m->appendFiles((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches);
m->mothurRemove((outputMisMatches + toString(processIDS[i]) + ".temp"));
+
+ if(allFiles){
+ for(int j=0;j<fastaFileNames.size();j++){
+ for(int k=0;k<fastaFileNames[j].size();k++){
+ if (fastaFileNames[j][k] != "") {
+ m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
+ m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
+
+ m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
+ m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
+ }
+ }
+ }
+ }
}
return num;
}
}
//**********************************************************************************************************************
-int MakeContigsCommand::driver(vector<string> files, string outputFasta, string outputQual, string outputMisMatches){
+int MakeContigsCommand::driver(vector<string> files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames){
try {
Alignment* alignment;
m->openInputFile(thisrfastafile, inRFasta);
m->openInputFile(thisrqualfile, inRQual);
- ofstream outFasta, outQual, outMisMatch;
+ ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual;
m->openOutputFile(outputFasta, outFasta);
m->openOutputFile(outputQual, outQual);
+ m->openOutputFile(outputScrapFasta, outScrapFasta);
+ m->openOutputFile(outputScrapQual, outScrapQual);
m->openOutputFile(outputMisMatches, outMisMatch);
outMisMatch << "Name\tLength\tMisMatches\n";
+ TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, primers, barcodes);
+
while ((!inFQual.eof()) && (!inFFasta.eof()) && (!inRFasta.eof()) && (!inRQual.eof())) {
if (m->control_pressed) { break; }
+ int success = 1;
+ string trashCode = "";
+ int currentSeqsDiffs = 0;
+
//read seqs and quality info
Sequence fSeq(inFFasta); m->gobble(inFFasta);
Sequence rSeq(inRFasta); m->gobble(inRFasta);
QualityScores fQual(inFQual); m->gobble(inFQual);
QualityScores rQual(inRQual); m->gobble(inRQual);
+ int barcodeIndex = 0;
+ int primerIndex = 0;
+
+ if(barcodes.size() != 0){
+ success = trimOligos.stripBarcode(fSeq, rSeq, fQual, rQual, barcodeIndex);
+ if(success > bdiffs) { trashCode += 'b'; }
+ else{ currentSeqsDiffs += success; }
+ }
+
+ if(primers.size() != 0){
+ success = trimOligos.stripForward(fSeq, rSeq, fQual, rQual, primerIndex);
+ if(success > pdiffs) { trashCode += 'f'; }
+ else{ currentSeqsDiffs += success; }
+ }
+
+ if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
+
//flip the reverse reads
rSeq.reverseComplement();
rQual.flipQScores();
-
+
//pairwise align
alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
fSeq.setAligned(alignment->getSeqAAln());
rSeq.setAligned(alignment->getSeqBAln());
int length = fSeq.getAligned().length();
-
+
//traverse alignments merging into one contiguous seq
string contig = "";
vector<int> contigScores;
string seq2 = rSeq.getAligned();
vector<int> scores1 = fQual.getQualityScores();
vector<int> scores2 = rQual.getQualityScores();
-
- // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; }
+
+ // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; }
int overlapStart = fSeq.getStartPos();
int seq2Start = rSeq.getStartPos();
//bigger of the 2 starting positions is the location of the overlapping start
contig += seq1[i];
contigScores.push_back(scores1[ABaseMap[i]]);
}
+
+ }
+ if(trashCode.length() == 0){
+ if (createGroup) {
+ if(barcodes.size() != 0){
+ string thisGroup = barcodeNameVector[barcodeIndex];
+ if (primers.size() != 0) {
+ if (primerNameVector[primerIndex] != "") {
+ if(thisGroup != "") {
+ thisGroup += "." + primerNameVector[primerIndex];
+ }else {
+ thisGroup = primerNameVector[primerIndex];
+ }
+ }
+ }
+
+ if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
+
+ groupMap[fSeq.getName()] = thisGroup;
+
+ map<string, int>::iterator it = groupCounts.find(thisGroup);
+ if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
+ else { groupCounts[it->first] ++; }
+
+ }
+ }
+
+ if(allFiles){
+ ofstream output;
+ m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
+ output << ">" << fSeq.getName() << endl << contig << endl;
+ output.close();
+
+ m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
+ output << ">" << fSeq.getName() << endl;
+ for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; }
+ output << endl;
+ output.close();
+ }
+
+ //output
+ outFasta << ">" << fSeq.getName() << endl << contig << endl;
+ outQual << ">" << fSeq.getName() << endl;
+ for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
+ outQual << endl;
+ outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
+ }else {
+ //output
+ outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
+ outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl;
+ for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; }
+ outScrapQual << endl;
}
- //if (num < 5) { cout << overlapStart << '\t' << overlapEnd << endl << seq1 << endl << seq2 << endl<< contig << endl; }
- //output
- outFasta << ">" << fSeq.getName() << endl << contig << endl;
- outQual << ">" << fSeq.getName() << endl;
- for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
- outQual << endl;
- outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
-
num++;
//report progress
inRQual.close();
outFasta.close();
outQual.close();
+ outScrapFasta.close();
+ outScrapQual.close();
outMisMatch.close();
delete alignment;
- if (m->control_pressed) { m->mothurRemove(outputQual); m->mothurRemove(outputFasta); m->mothurRemove(outputMisMatches);}
+ if (m->control_pressed) { m->mothurRemove(outputQual); m->mothurRemove(outputFasta); m->mothurRemove(outputScrapQual); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches);}
return num;
}
m->openInputFile(rfastqfile, inReverse);
count = 0;
- while ((!inForward.eof()) && (!inReverse.eof())) {
+ map<string, fastqRead> uniques;
+ map<string, fastqRead>::iterator itUniques;
+ while ((!inForward.eof()) || (!inReverse.eof())) {
if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; }
//get a read from forward and reverse fastq files
- fastqRead fread = readFastq(inForward);
- fastqRead rread = readFastq(inReverse);
- checkReads(fread, rread);
-
- if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; }
-
- //if the reads are okay write to output files
- int process = count % processors;
+ bool ignoref, ignorer;
+ fastqRead thisFread, thisRread;
+ if (!inForward.eof()) { thisFread = readFastq(inForward, ignoref); }
+ else { ignoref = true; }
+ if (!inReverse.eof()) { thisRread = readFastq(inReverse, ignorer); }
+ else { ignorer = true; }
- *(tempfiles[process][0]) << ">" << fread.name << endl << fread.sequence << endl;
- *(tempfiles[process][1]) << ">" << fread.name << endl;
- for (int i = 0; i < fread.scores.size(); i++) { *(tempfiles[process][1]) << fread.scores[i] << " "; }
- *(tempfiles[process][1]) << endl;
- *(tempfiles[process][2]) << ">" << rread.name << endl << rread.sequence << endl;
- *(tempfiles[process][3]) << ">" << rread.name << endl;
- for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; }
- *(tempfiles[process][3]) << endl;
+ vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques);
- count++;
-
- //report progress
- if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
-
+ for (int i = 0; i < reads.size(); i++) {
+ fastqRead fread = reads[i].forward;
+ fastqRead rread = reads[i].reverse;
+
+ if (checkReads(fread, rread)) {
+ if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; }
+
+ //if the reads are okay write to output files
+ int process = count % processors;
+
+ *(tempfiles[process][0]) << ">" << fread.name << endl << fread.sequence << endl;
+ *(tempfiles[process][1]) << ">" << fread.name << endl;
+ for (int i = 0; i < fread.scores.size(); i++) { *(tempfiles[process][1]) << fread.scores[i] << " "; }
+ *(tempfiles[process][1]) << endl;
+ *(tempfiles[process][2]) << ">" << rread.name << endl << rread.sequence << endl;
+ *(tempfiles[process][3]) << ">" << rread.name << endl;
+ for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; }
+ *(tempfiles[process][3]) << endl;
+
+ count++;
+
+ //report progress
+ if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
+ }
+ }
}
//report progress
if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
-
+ if (uniques.size() != 0) {
+ for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) {
+ m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n");
+ }
+ m->mothurOutEndLine();
+ }
//close files, delete ofstreams
for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
inReverse.close();
//adjust for really large processors or really small files
+ if (count == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; }
if (count < processors) {
for (int i = count; i < processors; i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } files[i].clear(); }
files.resize(count);
}
}
//**********************************************************************************************************************
-fastqRead MakeContigsCommand::readFastq(ifstream& in){
+vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques){
+ try {
+ vector<pairFastqRead> reads;
+ map<string, fastqRead>::iterator itUniques;
+
+ if (!ignoref && !ignorer) {
+ if (forward.name == reverse.name) {
+ pairFastqRead temp(forward, reverse);
+ reads.push_back(temp);
+ }else {
+ //look for forward pair
+ itUniques = uniques.find(forward.name);
+ if (itUniques != uniques.end()) { //we have the pair for this read
+ pairFastqRead temp(forward, itUniques->second);
+ reads.push_back(temp);
+ uniques.erase(itUniques);
+ }else { //save this read for later
+ uniques[forward.name] = forward;
+ }
+
+ //look for reverse pair
+ itUniques = uniques.find(reverse.name);
+ if (itUniques != uniques.end()) { //we have the pair for this read
+ pairFastqRead temp(itUniques->second, reverse);
+ reads.push_back(temp);
+ uniques.erase(itUniques);
+ }else { //save this read for later
+ uniques[reverse.name] = reverse;
+ }
+ }
+ }else if (!ignoref && ignorer) { //ignore reverse keep forward
+ //look for forward pair
+ itUniques = uniques.find(forward.name);
+ if (itUniques != uniques.end()) { //we have the pair for this read
+ pairFastqRead temp(forward, itUniques->second);
+ reads.push_back(temp);
+ uniques.erase(itUniques);
+ }else { //save this read for later
+ uniques[forward.name] = forward;
+ }
+
+ }else if (ignoref && !ignorer) { //ignore forward keep reverse
+ //look for reverse pair
+ itUniques = uniques.find(reverse.name);
+ if (itUniques != uniques.end()) { //we have the pair for this read
+ pairFastqRead temp(itUniques->second, reverse);
+ reads.push_back(temp);
+ uniques.erase(itUniques);
+ }else { //save this read for later
+ uniques[reverse.name] = reverse;
+ }
+ }//else ignore both and do nothing
+
+ return reads;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "MakeContigsCommand", "readFastqFiles");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){
try {
fastqRead read;
+ ignore = false;
+
//read sequence name
- string name = m->getline(in); m->gobble(in);
- if (name == "") { m->mothurOut("[ERROR]: Blank fasta name."); m->mothurOutEndLine(); m->control_pressed = true; return read; }
- else if (name[0] != '@') { m->mothurOut("[ERROR]: reading " + name + " expected a name with @ as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; return read; }
+ string line = m->getline(in); m->gobble(in);
+ vector<string> pieces = m->splitWhiteSpace(line);
+ string name = ""; if (pieces.size() != 0) { name = pieces[0]; }
+ if (name == "") { m->mothurOut("[WARNING]: Blank fasta name, ignoring read."); m->mothurOutEndLine(); ignore=true; }
+ else if (name[0] != '@') { m->mothurOut("[WARNING]: reading " + name + " expected a name with @ as a leading character, ignoring read."); m->mothurOutEndLine(); ignore=true; }
else { name = name.substr(1); }
//read sequence
string sequence = m->getline(in); m->gobble(in);
- if (sequence == "") { m->mothurOut("[ERROR]: missing sequence for " + name); m->mothurOutEndLine(); m->control_pressed = true; return read; }
+ if (sequence == "") { m->mothurOut("[WARNING]: missing sequence for " + name + ", ignoring."); ignore=true; }
//read sequence name
- string name2 = m->getline(in); m->gobble(in);
- if (name2 == "") { m->mothurOut("[ERROR]: Blank quality name."); m->mothurOutEndLine(); m->control_pressed = true; return read; }
- else if (name2[0] != '+') { m->mothurOut("[ERROR]: reading " + name2 + " expected a name with + as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; return read; }
- else { name2 = name2.substr(1); }
+ line = m->getline(in); m->gobble(in);
+ pieces = m->splitWhiteSpace(line);
+ string name2 = ""; if (pieces.size() != 0) { name2 = pieces[0]; }
+ if (name2 == "") { m->mothurOut("[WARNING]: expected a name with + as a leading character, ignoring."); ignore=true; }
+ else if (name2[0] != '+') { m->mothurOut("[WARNING]: reading " + name2 + " expected a name with + as a leading character, ignoring."); ignore=true; }
+ else { name2 = name2.substr(1); if (name2 == "") { name2 = name; } }
//read quality scores
string quality = m->getline(in); m->gobble(in);
- if (quality == "") { m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; return read; }
-
+ if (quality == "") { m->mothurOut("[WARNING]: missing quality for " + name2 + ", ignoring."); ignore=true; }
+
//sanity check sequence length and number of quality scores match
- if (name2 != "") { if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; return read; } }
- if (quality.length() != sequence.length()) { m->mothurOut("[ERROR]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; return read; }
+ if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } }
+ if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; }
vector<int> qualScores;
- int controlChar = int('@');
+ int controlChar = int('!');
for (int i = 0; i < quality.length(); i++) {
int temp = int(quality[i]);
temp -= controlChar;
qualScores.push_back(temp);
}
-
+
read.name = name;
read.sequence = sequence;
read.scores = qualScores;
try {
bool good = true;
- //fix names
- if ((forward.name.length() > 2) && (reverse.name.length() > 2)) {
- forward.name = forward.name.substr(0, forward.name.length()-2);
- reverse.name = reverse.name.substr(0, reverse.name.length()-2);
- }else { good = false; m->control_pressed = true; }
-
- //do names match
- if (forward.name != reverse.name) {
- m->mothurOut("[ERROR]: read " + forward.name + " from " + ffastqfile + ", but read " + reverse.name + " from " + rfastqfile + ".\n");
- good = false; m->control_pressed = true;
- }
-
//do sequence lengths match
if (forward.sequence.length() != reverse.sequence.length()) {
- m->mothurOut("[ERROR]: For sequence " + forward.name + " I read a sequence of length " + toString(forward.sequence.length()) + " from " + ffastqfile + ", but read a sequence of length " + toString(reverse.sequence.length()) + " from " + rfastqfile + ".\n");
- good = false; m->control_pressed = true;
+ m->mothurOut("[WARNING]: For sequence " + forward.name + " I read a sequence of length " + toString(forward.sequence.length()) + " from " + ffastqfile + ", but read a sequence of length " + toString(reverse.sequence.length()) + " from " + rfastqfile + ", ignoring.\n");
+ good = false;
}
//do number of qual scores match
if (forward.scores.size() != reverse.scores.size()) {
- m->mothurOut("[ERROR]: For sequence " + forward.name + " I read " + toString(forward.scores.size()) + " quality scores from " + ffastqfile + ", but read " + toString(reverse.scores.size()) + " quality scores from " + rfastqfile + ".\n");
- good = false; m->control_pressed = true;
+ m->mothurOut("[WARNING]: For sequence " + forward.name + " I read " + toString(forward.scores.size()) + " quality scores from " + ffastqfile + ", but read " + toString(reverse.scores.size()) + " quality scores from " + rfastqfile + ", ignoring.\n");
+ good = false;
}
return good;
}
catch(exception& e) {
- m->errorOut(e, "MakeContigsCommand", "readFastq");
+ m->errorOut(e, "MakeContigsCommand", "checkReads");
exit(1);
}
}
if(foligo[i] == 'U') { foligo[i] = 'T'; }
}
- if(type == "PRIMER"){
+ if(type == "FORWARD"){
m->gobble(in);
in >> roligo;
roligo[i] = toupper(roligo[i]);
if(roligo[i] == 'U') { roligo[i] = 'T'; }
}
- roligo = reverseOligo(roligo);
+ //roligo = reverseOligo(roligo);
group = "";
roligo[i] = toupper(roligo[i]);
if(roligo[i] == 'U') { roligo[i] = 'T'; }
}
- roligo = reverseOligo(roligo);
+ //roligo = reverseOligo(roligo);
oligosPair newPair(foligo, roligo);
barcodeNameVector.push_back(group);
}else if(type == "LINKER"){
linker.push_back(foligo);
+ m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n");
}else if(type == "SPACER"){
spacer.push_back(foligo);
+ m->mothurOut("[WARNING]: make.contigs is not setup to remove spacers, ignoring.\n");
}
else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are primer, barcode, linker and spacer. Ignoring " + foligo + "."); m->mothurOutEndLine(); }
}
~fastqRead() {};
};
+struct pairFastqRead {
+ fastqRead forward;
+ fastqRead reverse;
+
+ pairFastqRead() {};
+ pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
+ ~pairFastqRead() {};
+};
/**************************************************************************************************/
class MakeContigsCommand : public Command {
void help() { m->mothurOut(getHelpString()); }
private:
- bool abort, allFiles;
+ bool abort, allFiles, createGroup;
string outputDir, ffastqfile, rfastqfile, align, oligosfile;
float match, misMatch, gapOpen, gapExtend;
int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
vector<string> primerNameVector;
vector<string> barcodeNameVector;
- map<string, int> groupCounts;
+ map<string, int> groupCounts;
+ map<string, string> groupMap;
//map<string, int> combos;
//map<string, int> groupToIndex;
//vector<string> groupVector;
- fastqRead readFastq(ifstream&);
+ fastqRead readFastq(ifstream&, bool&);
vector< vector<string> > readFastqFiles(int&);
bool checkReads(fastqRead&, fastqRead&);
- int createProcesses(vector< vector<string> >, string, string, string);
- int driver(vector<string>, string, string, string);
+ int createProcesses(vector< vector<string> >, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
+ int driver(vector<string>, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >);
bool getOligos(vector<vector<string> >&, vector<vector<string> >&);
string reverseOligo(string);
+ vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
};
/**************************************************************************************************/
struct contigsData {
string outputFasta;
string outputQual;
+ string outputScrapFasta;
+ string outputScrapQual;
string outputMisMatches;
string align;
vector<string> files;
+ vector<vector<string> > fastaFileNames;
+ vector<vector<string> > qualFileNames;
MothurOut* m;
float match, misMatch, gapOpen, gapExtend;
- int count, threshold, threadID;
+ int count, threshold, threadID, pdiffs, bdiffs, tdiffs;
+ bool allFiles, createGroup;
+ map<string, int> groupCounts;
+ map<string, string> groupMap;
+ vector<string> primerNameVector;
+ vector<string> barcodeNameVector;
+ map<int, oligosPair> barcodes;
+ map<int, oligosPair> primers;
contigsData(){}
- contigsData(vector<string> f, string of, string oq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int tid) {
+ contigsData(vector<string> f, string of, string oq, string osf, string osq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, map<int, oligosPair> br, map<int, oligosPair> pr, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<string>bnv, vector<string> pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) {
files = f;
outputFasta = of;
outputQual = oq;
threshold = thr;
align = al;
count = 0;
+ outputScrapFasta = osf;
+ outputScrapQual = osq;
+ fastaFileNames = ffn;
+ qualFileNames = qfn;
+ barcodes = br;
+ primers = pr;
+ barcodeNameVector = bnv;
+ primerNameVector = pnv;
+ pdiffs = pdf;
+ bdiffs = bdf;
+ tdiffs = tdf;
+ allFiles = all;
+ createGroup = cg;
threadID = tid;
}
};
if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
+ if(pDataArray->allFiles){
+ for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
+ for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
+ if (pDataArray->fastaFileNames[i][j] != "") {
+ ofstream temp;
+ pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
+ pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
+ }
+ }
+ }
+ }
+
ifstream inFFasta, inRFasta, inFQual, inRQual;
pDataArray->m->openInputFile(thisffastafile, inFFasta);
pDataArray->m->openInputFile(thisfqualfile, inFQual);
pDataArray->m->openInputFile(thisrfastafile, inRFasta);
pDataArray->m->openInputFile(thisrqualfile, inRQual);
- ofstream outFasta, outQual, outMisMatch;
+ ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual;
pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
pDataArray->m->openOutputFile(pDataArray->outputQual, outQual);
pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
+ pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
+ pDataArray->m->openOutputFile(pDataArray->outputScrapQual, outScrapQual);
outMisMatch << "Name\tLength\tMisMatches\n";
+ TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes);
+
while ((!inFQual.eof()) && (!inFFasta.eof()) && (!inRFasta.eof()) && (!inRQual.eof())) {
if (pDataArray->m->control_pressed) { break; }
+ int success = 1;
+ string trashCode = "";
+ int currentSeqsDiffs = 0;
+
//read seqs and quality info
Sequence fSeq(inFFasta); pDataArray->m->gobble(inFFasta);
Sequence rSeq(inRFasta); pDataArray->m->gobble(inRFasta);
QualityScores fQual(inFQual); pDataArray->m->gobble(inFQual);
QualityScores rQual(inRQual); pDataArray->m->gobble(inRQual);
+ int barcodeIndex = 0;
+ int primerIndex = 0;
+
+ if(pDataArray->barcodes.size() != 0){
+ success = trimOligos.stripBarcode(fSeq, rSeq, fQual, rQual, barcodeIndex);
+ if(success > pDataArray->bdiffs) { trashCode += 'b'; }
+ else{ currentSeqsDiffs += success; }
+ }
+
+ if(pDataArray->primers.size() != 0){
+ success = trimOligos.stripForward(fSeq, rSeq, fQual, rQual, primerIndex);
+ if(success > pDataArray->pdiffs) { trashCode += 'f'; }
+ else{ currentSeqsDiffs += success; }
+ }
+
+ if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
+
//flip the reverse reads
rSeq.reverseComplement();
rQual.flipQScores();
-
+
//pairwise align
alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
int numMismatches = 0;
string seq1 = fSeq.getAligned();
string seq2 = rSeq.getAligned();
-
vector<int> scores1 = fQual.getQualityScores();
vector<int> scores2 = rQual.getQualityScores();
- for (int i = 0; i < length; i++) {
+ int overlapStart = fSeq.getStartPos();
+ int seq2Start = rSeq.getStartPos();
+ //bigger of the 2 starting positions is the location of the overlapping start
+ if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
+ overlapStart = seq2Start;
+ for (int i = 0; i < overlapStart; i++) {
+ contig += seq1[i];
+ contigScores.push_back(scores1[ABaseMap[i]]);
+ }
+ }else { //seq1 starts later so take from 0 to overlapStart from seq2
+ for (int i = 0; i < overlapStart; i++) {
+ contig += seq2[i];
+ contigScores.push_back(scores2[BBaseMap[i]]);
+ }
+ }
+
+ int seq1End = fSeq.getEndPos();
+ int seq2End = rSeq.getEndPos();
+ int overlapEnd = seq1End;
+ if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends
+
+ for (int i = overlapStart; i < overlapEnd; i++) {
if (seq1[i] == seq2[i]) { //match, add base and choose highest score
contig += seq1[i];
contigScores.push_back(scores1[ABaseMap[i]]);
- if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; }
+ if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; }
}else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base
- if (scores2[BBaseMap[i]] >= pDataArray->threshold) {
+ if (scores2[BBaseMap[i]] < pDataArray->threshold) { } //
+ else {
contig += seq2[i];
contigScores.push_back(scores2[BBaseMap[i]]);
}
}else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base
- if (scores1[ABaseMap[i]] >= pDataArray->threshold) {
+ if (scores1[ABaseMap[i]] < pDataArray->threshold) { } //
+ else {
contig += seq1[i];
contigScores.push_back(scores1[ABaseMap[i]]);
}
}else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
char c = seq1[i];
contigScores.push_back(scores1[ABaseMap[i]]);
- if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[i] = scores2[BBaseMap[i]]; c = seq2[i]; }
+ if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; }
contig += c;
numMismatches++;
}else { //should never get here
}
}
- //output
- outFasta << ">" << fSeq.getName() << endl << contig << endl;
- outQual << ">" << fSeq.getName() << endl;
- for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
- outQual << endl;
- outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
-
+ if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
+ for (int i = overlapEnd; i < length; i++) {
+ contig += seq2[i];
+ contigScores.push_back(scores2[BBaseMap[i]]);
+ }
+ }else { //seq2 ends before seq1 so take from overlap to length from seq1
+ for (int i = overlapEnd; i < length; i++) {
+ contig += seq1[i];
+ contigScores.push_back(scores1[ABaseMap[i]]);
+ }
+
+ }
+
+ if(trashCode.length() == 0){
+ if (pDataArray->createGroup) {
+ if(pDataArray->barcodes.size() != 0){
+ string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
+ if (pDataArray->primers.size() != 0) {
+ if (pDataArray->primerNameVector[primerIndex] != "") {
+ if(thisGroup != "") {
+ thisGroup += "." + pDataArray->primerNameVector[primerIndex];
+ }else {
+ thisGroup = pDataArray->primerNameVector[primerIndex];
+ }
+ }
+ }
+
+ if (pDataArray->m->debug) { pDataArray->m->mothurOut(", group= " + thisGroup + "\n"); }
+
+ pDataArray->groupMap[fSeq.getName()] = thisGroup;
+
+ map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
+ if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; }
+ else { pDataArray->groupCounts[it->first] ++; }
+
+ }
+ }
+
+ if(pDataArray->allFiles){
+ ofstream output;
+ pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
+ output << ">" << fSeq.getName() << endl << contig << endl;
+ output.close();
+
+ pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
+ output << ">" << fSeq.getName() << endl;
+ for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; }
+ output << endl;
+ output.close();
+ }
+
+ //output
+ outFasta << ">" << fSeq.getName() << endl << contig << endl;
+ outQual << ">" << fSeq.getName() << endl;
+ for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; }
+ outQual << endl;
+ outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl;
+ }else {
+ //output
+ outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
+ outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl;
+ for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; }
+ outScrapQual << endl;
+ }
num++;
//report progress
outFasta.close();
outQual.close();
outMisMatch.close();
+ outScrapFasta.close();
+ outScrapQual.close();
delete alignment;
- if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches);}
+ if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapQual); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta);}
return 0;
exit(1);
}
}
+/***********************************************************************/
+vector<string> MothurOut::splitWhiteSpaceWithQuotes(string input){
+ try {
+ vector<string> pieces;
+ string rest = "";
+
+ int pos = input.find('\'');
+ int pos2 = input.find('\"');
+
+ if ((pos == string::npos) && (pos2 == string::npos)) { return splitWhiteSpace(input); } //no quotes to worry about
+ else {
+ for (int i = 0; i < input.length(); i++) {
+ if ((input[i] == '\'') || (input[i] == '\"') || (rest == "\'") || (rest == "\"")) { //grab everything til end or next ' or "
+ rest += input[i];
+ for (int j = i+1; j < input.length(); j++) {
+ if ((input[j] == '\'') || (input[j] == '\"')) { //then quit
+ rest += input[j];
+ i = j+1;
+ j+=input.length();
+ }else { rest += input[j]; }
+ }
+ }else if (!isspace(input[i])) { rest += input[i]; }
+ else {
+ if (rest != "") { pieces.push_back(rest); rest = ""; }
+ while (i < input.length()) { //gobble white space
+ if (isspace(input[i])) { i++; }
+ else { rest = input[i]; break; } //cout << "next piece buffer = " << nextPiece << endl;
+ }
+ }
+ }
+
+ if (rest != "") { pieces.push_back(rest); }
+ }
+ return pieces;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "splitWhiteSpace");
+ exit(1);
+ }
+}
//**********************************************************************************************************************
int MothurOut::readTax(string namefile, map<string, string>& taxMap) {
try {
exit(1);
}
}
+/**************************************************************************************************/
+
+bool MothurOut::inUsersGroups(vector<int> set, vector< vector<int> > sets) {
+ try {
+ for (int i = 0; i < sets.size(); i++) {
+ if (set == sets[i]) { return true; }
+ }
+ return false;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "inUsersGroups");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+bool MothurOut::inUsersGroups(int groupname, vector<int> Groups) {
+ try {
+ for (int i = 0; i < Groups.size(); i++) {
+ if (groupname == Groups[i]) { return true; }
+ }
+ return false;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "inUsersGroups");
+ exit(1);
+ }
+}
+
/**************************************************************************************************/
//returns true if any of the strings in first vector are in second vector
bool MothurOut::inUsersGroups(vector<string> groupnames, vector<string> Groups) {
bool checkReleaseVersion(ifstream&, string);
bool anyLabelsToProcess(string, set<string>&, string);
bool inUsersGroups(vector<string>, vector<string>);
+ bool inUsersGroups(vector<int>, vector< vector<int> >);
bool inUsersGroups(string, vector<string>);
+ bool inUsersGroups(int, vector<int>);
void getNumSeqs(ifstream&, int&);
int getNumSeqs(ifstream&);
int getNumNames(string);
void splitAtDash(string&, vector<string>&);
void splitAtChar(string&, vector<string>&, char);
void splitAtChar(string&, string&, char);
+ vector<string> splitWhiteSpaceWithQuotes(string);
int removeConfidences(string&);
string removeQuotes(string);
string makeList(vector<string>&);
CommandParameter pshared("shared", "InputTypes", "", "", "SharedRelMeta", "SharedRelMeta", "none",false,false); parameters.push_back(pshared);
CommandParameter prelabund("relabund", "InputTypes", "", "", "SharedRelMeta", "SharedRelMeta", "none",false,false); parameters.push_back(prelabund);
CommandParameter pmetadata("metadata", "InputTypes", "", "", "SharedRelMeta", "SharedRelMeta", "none",false,false); parameters.push_back(pmetadata);
+ CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
CommandParameter pmethod("method", "Multiple", "pearson-spearman-kendall", "pearson", "", "", "",false,false); parameters.push_back(pmethod);
string helpString = "";
helpString += "The otu.association command reads a shared or relabund file and calculates the correlation coefficients between otus.\n";
helpString += "If you provide a metadata file, mothur will calculate te correlation bewteen the metadata and the otus.\n";
- helpString += "The otu.association command parameters are shared, relabund, metadata, groups, method and label. The shared or relabund parameter is required.\n";
+ helpString += "The otu.association command parameters are shared, relabund, metadata, groups, method, cutoff and label. The shared or relabund parameter is required.\n";
helpString += "The groups parameter allows you to specify which of the groups you would like included. The group names are separated by dashes.\n";
helpString += "The label parameter allows you to select what distances level you would like used, and are also separated by dashes.\n";
+ helpString += "The cutoff parameter allows you to set a pvalue at which the otu will be reported.\n";
helpString += "The method parameter allows you to select what method you would like to use. Options are pearson, spearman and kendall. Default=pearson.\n";
helpString += "The otu.association command should be in the following format: otu.association(shared=yourSharedFile, method=yourMethod).\n";
helpString += "Example otu.association(shared=genus.pool.shared, method=kendall).\n";
method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "pearson"; }
+ string temp = validParameter.validFile(parameters, "cutoff", false);
+ if (temp == "not found") { temp = "10"; }
+ m->mothurConvert(temp, cutoff);
+
if ((method != "pearson") && (method != "spearman") && (method != "kendall")) { m->mothurOut(method + " is not a valid method. Valid methods are pearson, spearman, and kendall."); m->mothurOutEndLine(); abort = true; }
}
else if (method == "kendall") { coef = linear.calcKendall(xy[i], xy[k], sig); }
else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; }
- out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl;
+ if (sig < cutoff) { out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; }
}
}
}else { //compare otus to metadata
else if (method == "kendall") { coef = linear.calcKendall(xy[i], metadata[k], sig); }
else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; }
- out << m->binLabelsInFile[i] << '\t' << metadataLabels[k] << '\t' << coef << '\t' << sig << endl;
+ if (sig < cutoff) { out << m->binLabelsInFile[i] << '\t' << metadataLabels[k] << '\t' << coef << '\t' << sig << endl; }
}
}
else if (method == "kendall") { coef = linear.calcKendall(xy[i], xy[k], sig); }
else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; }
- out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl;
+ if (sig < cutoff) { out << m->binLabelsInFile[i] << '\t' << m->binLabelsInFile[k] << '\t' << coef << '\t' << sig << endl; }
}
}
}else { //compare otus to metadata
else if (method == "kendall") { coef = linear.calcKendall(xy[i], metadata[k], sig); }
else { m->mothurOut("[ERROR]: invalid method, choices are spearman, pearson or kendall."); m->mothurOutEndLine(); m->control_pressed = true; }
- out << m->binLabelsInFile[i] << '\t' << metadataLabels[k] << '\t' << coef << '\t' << sig << endl;
+ if (sig < cutoff) { out << m->binLabelsInFile[i] << '\t' << metadataLabels[k] << '\t' << coef << '\t' << sig << endl; }
}
}
string sharedfile, relabundfile, metadatafile, groups, label, inputFileName, outputDir, method;
bool abort, pickedGroups, allLines;
+ double cutoff;
set<string> labels;
vector<SharedRAbundFloatVector*> metadataLookup;
vector< vector< double> > metadata;
map<string, string>::iterator itNames = namemap.find(names[i]); //make sure this name is in namefile
if (itNames != namemap.end()) { name += namemap[names[i]] + ","; } //you found it in namefile
- else { m->mothurOut(names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); exit(1); }
+ else { m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
}else{ name += names[i] + ","; }
}
}
+
+ if (m->control_pressed) { break; }
+
name = name.substr(0, name.length()-1); //rip off extra ','
//add bin to list vector
if (name != "") { list.push_back(name); } //caused by unknown
if (m->control_pressed) { in.close(); outSummary.close(); return 1; }
Sequence current(in); m->gobble(in);
-
+
if (current.getName() != "") {
int num = 1;
if(letter == '>'){
fastaFile.putback(letter);
break;
- }
+ }else if (letter == ' ') {;}
else if(isprint(letter)){
letter = toupper(letter);
if(letter == 'U'){letter = 'T';}
if(letter == '>'){
fastaFile.putback(letter);
break;
- }
+ }else if (letter == ' ') {;}
else if(isprint(letter)){
letter = toupper(letter);
if(letter == 'U'){letter = 'T';}
m->saveNextLabel = nextLabel;
m->setAllGroups(allGroups);
- for (int i = 0; i < allGroups.size(); i++) { cout << allGroups[i] << endl; }
-
}
catch(exception& e) {
m->errorOut(e, "SharedRAbundFloatVector", "SharedRAbundFloatVector");
for (int i = 0; i < Groups.size(); i++) {
if (m->inUsersGroups(Groups[i], m->getGroups())) {
if (m->control_pressed) { break; }
-
+ cout << Groups[i] << endl;
int thisSize = ct->getGroupCount(Groups[i]);
if (thisSize >= size) {
revPrimer = r;
linker = lk;
spacer = sp;
+ maxFBarcodeLength = 0;
+ for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+ if(it->first.length() > maxFBarcodeLength){
+ maxFBarcodeLength = it->first.length();
+ }
+ }
+ maxFPrimerLength = 0;
+ map<string,int>::iterator it;
+ for(it=primers.begin();it!=primers.end();it++){
+ if(it->first.length() > maxFPrimerLength){
+ maxFPrimerLength = it->first.length();
+ }
+ }
+
+ maxLinkerLength = 0;
+ for(int i = 0; i < linker.size(); i++){
+ if(linker[i].length() > maxLinkerLength){
+ maxLinkerLength = linker[i].length();
+ }
+ }
+
+ maxSpacerLength = 0;
+ for(int i = 0; i < spacer.size(); i++){
+ if(spacer[i].length() > maxSpacerLength){
+ maxSpacerLength = spacer[i].length();
+ }
+ }
}
catch(exception& e) {
m->errorOut(e, "TrimOligos", "TrimOligos");
}
/********************************************************************/
//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
-TrimOligos::TrimOligos(int p, int b, int l, int s, map<int, oligosPair> pr, map<int, oligosPair> br, vector<string> lk, vector<string> sp){
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<int, oligosPair> pr, map<int, oligosPair> br){
try {
m = MothurOut::getInstance();
ldiffs = l;
sdiffs = s;
- ibarcodes = br;
- iprimers = pr;
- linker = lk;
- spacer = sp;
+ maxFBarcodeLength = 0;
+ for(map<int,oligosPair>::iterator it=br.begin();it!=br.end();it++){
+ string forward = it->second.forward;
+ map<string, vector<int> >::iterator itForward = ifbarcodes.find(forward);
+
+ if(forward.length() > maxFBarcodeLength){ maxFBarcodeLength = forward.length(); }
+
+ if (itForward == ifbarcodes.end()) {
+ vector<int> temp; temp.push_back(it->first);
+ ifbarcodes[forward] = temp;
+ }else { itForward->second.push_back(it->first); }
+ }
+
+ maxFPrimerLength = 0;
+ for(map<int,oligosPair>::iterator it=pr.begin();it!=pr.end();it++){
+ string forward = it->second.forward;
+ map<string, vector<int> >::iterator itForward = ifprimers.find(forward);
+
+ if(forward.length() > maxFPrimerLength){ maxFPrimerLength = forward.length(); }
+
+ if (itForward == ifprimers.end()) {
+ vector<int> temp; temp.push_back(it->first);
+ ifprimers[forward] = temp;
+ }else { itForward->second.push_back(it->first); }
+ }
+
+ maxRBarcodeLength = 0;
+ for(map<int,oligosPair>::iterator it=br.begin();it!=br.end();it++){
+ string forward = it->second.reverse;
+ map<string, vector<int> >::iterator itForward = irbarcodes.find(forward);
+
+ if(forward.length() > maxRBarcodeLength){ maxRBarcodeLength = forward.length(); }
+
+ if (itForward == irbarcodes.end()) {
+ vector<int> temp; temp.push_back(it->first);
+ irbarcodes[forward] = temp;
+ }else { itForward->second.push_back(it->first); }
+ }
+
+ maxRPrimerLength = 0;
+ for(map<int,oligosPair>::iterator it=pr.begin();it!=pr.end();it++){
+ string forward = it->second.reverse;
+ map<string, vector<int> >::iterator itForward = irprimers.find(forward);
+
+ if(forward.length() > maxRPrimerLength){ maxRPrimerLength = forward.length(); }
+
+ if (itForward == irprimers.end()) {
+ vector<int> temp; temp.push_back(it->first);
+ irprimers[forward] = temp;
+ }else { itForward->second.push_back(it->first); }
+ }
+
+ ipbarcodes = br;
+ ipprimers = pr;
}
catch(exception& e) {
m->errorOut(e, "TrimOligos", "TrimOligos");
if ((bdiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
Alignment* alignment;
- if (barcodes.size() > 0) {
- map<string,int>::iterator it;
-
- for(it=barcodes.begin();it!=barcodes.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
-
- }else{ alignment = NULL; }
+ if (barcodes.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
string oligo = it->first;
// int length = oligo.length();
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
success = bdiffs + 10;
break;
}
int success = bdiffs + 1; //guilty until proven innocent
//can you find the forward barcode
- for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+ for(map<int,oligosPair>::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){
string foligo = it->second.forward;
string roligo = it->second.reverse;
//if you found the barcode or if you don't want to allow for diffs
if ((bdiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- //look for forward
- int maxLength = 0;
-
Alignment* alignment;
- if (ibarcodes.size() > 0) {
- for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
- if(it->second.forward.length() > maxLength){ maxLength = it->second.forward.length(); }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
- }else{ alignment = NULL; }
+ if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
int minCount = 1;
- int minFGroup = -1;
- int minFPos = 0;
-
- for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
- string oligo = it->second.forward;
+ vector< vector<int> > minFGroup;
+ vector<int> minFPos;
+
+ //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
+ /*
+ 1 Sarah Westcott
+ 2 John Westcott
+ 3 Anna Westcott
+ 4 Sarah Schloss
+ 5 Pat Schloss
+ 6 Gail Brown
+ 7 Pat Moore
+
+ only want to look for forward = Sarah, John, Anna, Pat, Gail
+ reverse = Westcott, Schloss, Brown, Moore
+
+ but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
+ */
+ //cout << endl << forwardSeq.getName() << endl;
+ for(map<string, vector<int> >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){
+ string oligo = it->first;
- if(rawFSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
success = bdiffs + 10;
break;
}
-
+ //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl;
//use needleman to align first barcode.length()+numdiffs of sequence to each barcode
alignment->align(oligo, rawFSequence.substr(0,oligo.length()+bdiffs));
oligo = alignment->getSeqAAln();
string temp = alignment->getSeqBAln();
-
+
int alnLength = oligo.length();
for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
temp = temp.substr(0,alnLength);
int numDiff = countDiffs(oligo, temp);
+ if (alnLength == 0) { numDiff = bdiffs + 100; }
+ //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
+
if(numDiff < minDiff){
minDiff = numDiff;
minCount = 1;
- minFGroup = it->first;
- minFPos = 0;
+ minFGroup.clear();
+ minFGroup.push_back(it->second);
+ int tempminFPos = 0;
+ minFPos.clear();
for(int i=0;i<alnLength;i++){
if(temp[i] != '-'){
- minFPos++;
+ tempminFPos++;
}
}
+ minFPos.push_back(tempminFPos);
}else if(numDiff == minDiff){
- minCount++;
+ minFGroup.push_back(it->second);
+ int tempminFPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ tempminFPos++;
+ }
+ }
+ minFPos.push_back(tempminFPos);
}
-
}
-
+
+ //cout << minDiff << '\t' << minCount << '\t' << endl;
if(minDiff > bdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
else{
//check for reverse match
if (alignment != NULL) { delete alignment; }
- maxLength = 0;
- if (ibarcodes.size() > 0) {
- for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
- if(it->second.reverse.length() > maxLength){ maxLength = it->second.reverse.length(); }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
- }else{ alignment = NULL; }
+ if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
minDiff = 1e6;
minCount = 1;
- int minRGroup = -1;
- int minRPos = 0;
+ vector< vector<int> > minRGroup;
+ vector<int> minRPos;
- for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
- string oligo = it->second.reverse;
-
- if(rawRSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ for(map<string, vector<int> >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){
+ string oligo = it->first;
+ //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+bdiffs) << endl;
+ if(rawRSequence.length() < maxRBarcodeLength){ //let's just assume that the barcodes are the same length
success = bdiffs + 10;
break;
}
//use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawRSequence.substr((rawRSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+ alignment->align(oligo, rawRSequence.substr(0,oligo.length()+bdiffs));
oligo = alignment->getSeqAAln();
string temp = alignment->getSeqBAln();
int alnLength = oligo.length();
- for(int i=0;i<alnLength;i++){ if(oligo[i] != '-'){ alnLength = i; break; } }
+ for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
oligo = oligo.substr(0,alnLength);
temp = temp.substr(0,alnLength);
int numDiff = countDiffs(oligo, temp);
+ if (alnLength == 0) { numDiff = bdiffs + 100; }
+ //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
if(numDiff < minDiff){
minDiff = numDiff;
minCount = 1;
- minRGroup = it->first;
- minRPos = 0;
+ minRGroup.clear();
+ minRGroup.push_back(it->second);
+ int tempminRPos = 0;
+ minRPos.clear();
for(int i=0;i<alnLength;i++){
if(temp[i] != '-'){
- minRPos++;
+ tempminRPos++;
}
}
+ minRPos.push_back(tempminRPos);
}else if(numDiff == minDiff){
- minCount++;
+ int tempminRPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ tempminRPos++;
+ }
+ }
+ minRPos.push_back(tempminRPos);
+ minRGroup.push_back(it->second);
}
}
-
+
+ /*cout << minDiff << '\t' << minCount << '\t' << endl;
+ for (int i = 0; i < minFGroup.size(); i++) {
+ cout << i << '\t';
+ for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
+ cout << endl;
+ }
+ cout << endl;
+ for (int i = 0; i < minRGroup.size(); i++) {
+ cout << i << '\t';
+ for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
+ cout << endl;
+ }
+ cout << endl;*/
if(minDiff > bdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
- else{
+ else {
+ bool foundMatch = false;
+ vector<int> matches;
+ for (int i = 0; i < minFGroup.size(); i++) {
+ for (int j = 0; j < minFGroup[i].size(); j++) {
+ for (int k = 0; k < minRGroup.size(); k++) {
+ if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
+ }
+ }
+ }
+
+ int fStart = 0;
+ int rStart = 0;
+ if (matches.size() == 1) {
+ foundMatch = true;
+ group = matches[0];
+ fStart = minFPos[0];
+ rStart = minRPos[0];
+ }
+
//we have an acceptable match for the forward and reverse, but do they match?
- if (minFGroup == minRGroup) {
- group = minFGroup;
- forwardSeq.setUnaligned(rawFSequence.substr(minFPos));
- reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-minRPos)));
- forwardQual.trimQScores(minFPos, -1);
- reverseQual.trimQScores(-1, rawRSequence.length()-minRPos);
+ if (foundMatch) {
+ forwardSeq.setUnaligned(rawFSequence.substr(fStart));
+ reverseSeq.setUnaligned(rawRSequence.substr(rStart));
+ forwardQual.trimQScores(fStart, -1);
+ reverseQual.trimQScores(rStart, -1);
success = minDiff;
}else { success = bdiffs + 100; }
}
}
//*******************************************************************/
-int TrimOligos::stripBarcode(Sequence& seq, int& group){
+int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){
try {
+ //look for forward barcode
+ string rawFSequence = forwardSeq.getUnaligned();
+ string rawRSequence = reverseSeq.getUnaligned();
+ int success = pdiffs + 1; //guilty until proven innocent
- string rawSequence = seq.getUnaligned();
- int success = bdiffs + 1; //guilty until proven innocent
-
- //can you find the barcode
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ //can you find the forward barcode
+ for(map<int,oligosPair>::iterator it=ipprimers.begin();it!=ipprimers.end();it++){
+ string foligo = it->second.forward;
+ string roligo = it->second.reverse;
+
+ if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length
+ success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
+ break;
+ }
+ if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length
+ success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
break;
}
- if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
-
+ if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
+ group = it->first;
+ forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
+ reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
+ forwardQual.trimQScores(foligo.length(), -1);
+ reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
success = 0;
break;
}
}
//if you found the barcode or if you don't want to allow for diffs
- if ((bdiffs == 0) || (success == 0)) { return success; }
-
+ if ((pdiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
Alignment* alignment;
- if (barcodes.size() > 0) {
- map<string,int>::iterator it=barcodes.begin();
-
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
-
- }else{ alignment = NULL; }
+ if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
int minCount = 1;
- int minGroup = -1;
- int minPos = 0;
-
- for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+ vector< vector<int> > minFGroup;
+ vector<int> minFPos;
+
+ //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
+ /*
+ 1 Sarah Westcott
+ 2 John Westcott
+ 3 Anna Westcott
+ 4 Sarah Schloss
+ 5 Pat Schloss
+ 6 Gail Brown
+ 7 Pat Moore
+
+ only want to look for forward = Sarah, John, Anna, Pat, Gail
+ reverse = Westcott, Schloss, Brown, Moore
+
+ but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
+ */
+ //cout << endl << forwardSeq.getName() << endl;
+ for(map<string, vector<int> >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){
string oligo = it->first;
- // int length = oligo.length();
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10;
+ if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length
+ success = pdiffs + 10;
break;
}
-
+ //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl;
//use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
+ alignment->align(oligo, rawFSequence.substr(0,oligo.length()+pdiffs));
oligo = alignment->getSeqAAln();
string temp = alignment->getSeqBAln();
-
+
int alnLength = oligo.length();
- for(int i=oligo.length()-1;i>=0;i--){
- if(oligo[i] != '-'){ alnLength = i+1; break; }
- }
+ for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
oligo = oligo.substr(0,alnLength);
temp = temp.substr(0,alnLength);
-
- int numDiff = countDiffs(oligo, temp);
+ int numDiff = countDiffs(oligo, temp);
+ if (alnLength == 0) { numDiff = pdiffs + 100; }
+ //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
+
if(numDiff < minDiff){
minDiff = numDiff;
minCount = 1;
- minGroup = it->second;
- minPos = 0;
+ minFGroup.clear();
+ minFGroup.push_back(it->second);
+ int tempminFPos = 0;
+ minFPos.clear();
for(int i=0;i<alnLength;i++){
if(temp[i] != '-'){
- minPos++;
+ tempminFPos++;
}
}
- }
- else if(numDiff == minDiff){
- minCount++;
- }
-
- }
-
- if(minDiff > bdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
- success = minDiff;
- }
-
- if (alignment != NULL) { delete alignment; }
-
- }
-
- return success;
-
- }
- catch(exception& e) {
- m->errorOut(e, "TrimOligos", "stripBarcode");
- exit(1);
- }
-
-}
-/*******************************************************************
-int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){
- try {
-
- string rawSequence = seq.getUnaligned();
- int success = bdiffs + 1; //guilty until proven innocent
-
- //can you find the barcode
- for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
- string oligo = it->first;
- if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
- break;
- }
-
- if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
- group = it->second;
- seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
-
- if(qual.getName() != ""){
- qual.trimQScores(-1, rawSequence.length()-oligo.length());
- }
-
- success = 0;
- break;
- }
- }
-
- //if you found the barcode or if you don't want to allow for diffs
- if ((bdiffs == 0) || (success == 0)) { return success; }
-
- else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
- Alignment* alignment;
- if (rbarcodes.size() > 0) {
- map<string,int>::iterator it;
-
- for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
-
- }else{ alignment = NULL; }
-
- //can you find the barcode
- int minDiff = 1e6;
- int minCount = 1;
- int minGroup = -1;
- int minPos = 0;
-
- for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
- string oligo = it->first;
- // int length = oligo.length();
-
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
- success = bdiffs + 10;
- break;
- }
-
- //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
- oligo = alignment->getSeqAAln();
- string temp = alignment->getSeqBAln();
-
- int alnLength = oligo.length();
-
- for(int i=0;i<alnLength;i++){
- if(oligo[i] != '-'){ alnLength = i; break; }
- }
- oligo = oligo.substr(alnLength);
- temp = temp.substr(alnLength);
-
- int numDiff = countDiffs(oligo, temp);
-
- if(numDiff < minDiff){
- minDiff = numDiff;
- minCount = 1;
- minGroup = it->second;
- minPos = 0;
- for(int i=alnLength-1;i>=0;i--){
+ minFPos.push_back(tempminFPos);
+ }else if(numDiff == minDiff){
+ minFGroup.push_back(it->second);
+ int tempminFPos = 0;
+ for(int i=0;i<alnLength;i++){
if(temp[i] != '-'){
- minPos++;
+ tempminFPos++;
}
}
+ minFPos.push_back(tempminFPos);
}
- else if(numDiff == minDiff){
- minCount++;
- }
-
}
-
- if(minDiff > bdiffs) { success = minDiff; } //no good matches
- else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
- else{ //use the best match
- group = minGroup;
- seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+
+ //cout << minDiff << '\t' << minCount << '\t' << endl;
+ if(minDiff > pdiffs) { success = minDiff; } //no good matches
+ else{
+ //check for reverse match
+ if (alignment != NULL) { delete alignment; }
- if(qual.getName() != ""){
- qual.trimQScores(-1, (rawSequence.length()-minPos));
- }
- success = minDiff;
+ if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); }
+ else{ alignment = NULL; }
+
+ //can you find the barcode
+ minDiff = 1e6;
+ minCount = 1;
+ vector< vector<int> > minRGroup;
+ vector<int> minRPos;
+
+ for(map<string, vector<int> >::iterator it=irprimers.begin();it!=irprimers.end();it++){
+ string oligo = it->first;
+ //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+pdiffs) << endl;
+ if(rawRSequence.length() < maxRPrimerLength){ //let's just assume that the barcodes are the same length
+ success = pdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+ alignment->align(oligo, rawRSequence.substr(0,oligo.length()+pdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+ for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+ int numDiff = countDiffs(oligo, temp);
+ if (alnLength == 0) { numDiff = pdiffs + 100; }
+
+ //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minRGroup.clear();
+ minRGroup.push_back(it->second);
+ int tempminRPos = 0;
+ minRPos.clear();
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ tempminRPos++;
+ }
+ }
+ minRPos.push_back(tempminRPos);
+ }else if(numDiff == minDiff){
+ int tempminRPos = 0;
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ tempminRPos++;
+ }
+ }
+ minRPos.push_back(tempminRPos);
+ minRGroup.push_back(it->second);
+ }
+
+ }
+
+ /*cout << minDiff << '\t' << minCount << '\t' << endl;
+ for (int i = 0; i < minFGroup.size(); i++) {
+ cout << i << '\t';
+ for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
+ cout << endl;
+ }
+ cout << endl;
+ for (int i = 0; i < minRGroup.size(); i++) {
+ cout << i << '\t';
+ for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
+ cout << endl;
+ }
+ cout << endl;*/
+ if(minDiff > pdiffs) { success = minDiff; } //no good matches
+ else {
+ bool foundMatch = false;
+ vector<int> matches;
+ for (int i = 0; i < minFGroup.size(); i++) {
+ for (int j = 0; j < minFGroup[i].size(); j++) {
+ for (int k = 0; k < minRGroup.size(); k++) {
+ if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
+ }
+ }
+ }
+
+ int fStart = 0;
+ int rStart = 0;
+ if (matches.size() == 1) {
+ foundMatch = true;
+ group = matches[0];
+ fStart = minFPos[0];
+ rStart = minRPos[0];
+ }
+
+ //we have an acceptable match for the forward and reverse, but do they match?
+ if (foundMatch) {
+ forwardSeq.setUnaligned(rawFSequence.substr(fStart));
+ reverseSeq.setUnaligned(rawRSequence.substr(rStart));
+ forwardQual.trimQScores(fStart, -1);
+ reverseQual.trimQScores(rStart, -1);
+ success = minDiff;
+ }else { success = pdiffs + 100; }
+ }
}
if (alignment != NULL) { delete alignment; }
-
}
return success;
}
catch(exception& e) {
- m->errorOut(e, "TrimOligos", "stripRBarcode");
+ m->errorOut(e, "TrimOligos", "stripIForward");
exit(1);
}
}
-/*******************************************************************
-int TrimOligos::stripRBarcode(Sequence& seq, int& group){
+//*******************************************************************/
+int TrimOligos::stripBarcode(Sequence& seq, int& group){
try {
string rawSequence = seq.getUnaligned();
int success = bdiffs + 1; //guilty until proven innocent
//can you find the barcode
- for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
string oligo = it->first;
if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
break;
}
-
- if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
group = it->second;
- seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
success = 0;
break;
if ((bdiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
- Alignment* alignment;
- if (rbarcodes.size() > 0) {
- map<string,int>::iterator it;
-
- for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
-
- }else{ alignment = NULL; }
+ Alignment* alignment;
+ if (barcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
int minGroup = -1;
int minPos = 0;
- for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+ for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
string oligo = it->first;
// int length = oligo.length();
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
success = bdiffs + 10;
break;
}
//use needleman to align first barcode.length()+numdiffs of sequence to each barcode
- alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
oligo = alignment->getSeqAAln();
string temp = alignment->getSeqBAln();
-
+
int alnLength = oligo.length();
- for(int i=0;i<alnLength;i++){
- if(oligo[i] != '-'){ alnLength = i; break; }
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
}
- oligo = oligo.substr(alnLength);
- temp = temp.substr(alnLength);
-
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
int numDiff = countDiffs(oligo, temp);
if(numDiff < minDiff){
minCount = 1;
minGroup = it->second;
minPos = 0;
- for(int i=alnLength-1;i>=0;i--){
+ for(int i=0;i<alnLength;i++){
if(temp[i] != '-'){
minPos++;
}
else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
else{ //use the best match
group = minGroup;
- seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
-
+ seq.setUnaligned(rawSequence.substr(minPos));
success = minDiff;
}
}
catch(exception& e) {
- m->errorOut(e, "TrimOligos", "stripRBarcode");
+ m->errorOut(e, "TrimOligos", "stripBarcode");
exit(1);
}
}
-//********************************************************************/
+
+/********************************************************************/
int TrimOligos::stripForward(Sequence& seq, int& group){
try {
if ((pdiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
Alignment* alignment;
- if (primers.size() > 0) {
- map<string,int>::iterator it;
-
- for(it=primers.begin();it!=primers.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
-
- }else{ alignment = NULL; }
+ if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
string oligo = it->first;
// int length = oligo.length();
- if(rawSequence.length() < maxLength){
+ if(rawSequence.length() < maxFPrimerLength){
success = pdiffs + 100;
break;
}
if ((pdiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
Alignment* alignment;
- if (primers.size() > 0) {
- map<string,int>::iterator it;
-
- for(it=primers.begin();it!=primers.end();it++){
- if(it->first.length() > maxLength){
- maxLength = it->first.length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
-
- }else{ alignment = NULL; }
+ if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
string oligo = it->first;
// int length = oligo.length();
- if(rawSequence.length() < maxLength){
+ if(rawSequence.length() < maxFPrimerLength){
success = pdiffs + 100;
break;
}
if ((ldiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
Alignment* alignment;
- if (linker.size() > 0) {
- for(int i = 0; i < linker.size(); i++){
- if(linker[i].length() > maxLength){
- maxLength = linker[i].length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1));
-
- }else{ alignment = NULL; }
+ if (linker.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
string oligo = linker[i];
// int length = oligo.length();
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length
success = ldiffs + 10;
break;
}
if ((ldiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
Alignment* alignment;
- if (linker.size() > 0) {
- for(int i = 0; i < linker.size(); i++){
- if(linker[i].length() > maxLength){
- maxLength = linker[i].length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1));
-
- }else{ alignment = NULL; }
+ if (linker.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
string oligo = linker[i];
// int length = oligo.length();
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length
success = ldiffs + 10;
break;
}
if ((sdiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
Alignment* alignment;
- if (spacer.size() > 0) {
- for(int i = 0; i < spacer.size(); i++){
- if(spacer[i].length() > maxLength){
- maxLength = spacer[i].length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1));
-
- }else{ alignment = NULL; }
+ if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
string oligo = spacer[i];
// int length = oligo.length();
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length
success = sdiffs + 10;
break;
}
if ((sdiffs == 0) || (success == 0)) { return success; }
else { //try aligning and see if you can find it
-
- int maxLength = 0;
-
Alignment* alignment;
- if (spacer.size() > 0) {
- for(int i = 0; i < spacer.size(); i++){
- if(spacer[i].length() > maxLength){
- maxLength = spacer[i].length();
- }
- }
- alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1));
-
- }else{ alignment = NULL; }
+ if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); }
+ else{ alignment = NULL; }
//can you find the barcode
int minDiff = 1e6;
string oligo = spacer[i];
// int length = oligo.length();
- if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
+ if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length
success = sdiffs + 10;
break;
}
public:
TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
- TrimOligos(int,int, int, int, map<int, oligosPair>, map<int, oligosPair>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, linker, spacer
+ TrimOligos(int,int, int, int, map<int, oligosPair>, map<int, oligosPair>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes
~TrimOligos();
int stripBarcode(Sequence&, int&);
vector<string> revPrimer;
vector<string> linker;
vector<string> spacer;
- map<int, oligosPair> ibarcodes;
- map<int, oligosPair> iprimers;
+ map<string, vector<int> > ifbarcodes;
+ map<string, vector<int> > ifprimers;
+ map<string, vector<int> > irbarcodes;
+ map<string, vector<int> > irprimers;
+ map<int, oligosPair> ipbarcodes;
+ map<int, oligosPair> ipprimers;
+
+ int maxFBarcodeLength, maxRBarcodeLength, maxFPrimerLength, maxRPrimerLength, maxLinkerLength, maxSpacerLength;
MothurOut* m;
HANDLE hThreadArray[processors-1];
//Create processor worker threads.
- for( int i=0; i<processors-1; i++){
+ for( int h=0; h<processors-1; h++){
string extension = "";
- if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
+ if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
vector<vector<string> > tempFASTAFileNames = fastaFileNames;
vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
vector<vector<string> > tempNameFileNames = nameFileNames;
tempFASTAFileNames,
tempPrimerQualFileNames,
tempNameFileNames,
- lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
+ lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m,
pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
pDataArray.push_back(tempTrim);
- hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
}
//parent do my part
m->openOutputFile(trimNameFileName, temp); temp.close();
m->openOutputFile(scrapNameFileName, temp); temp.close();
}
+ vector<vector<string> > tempFASTAFileNames = fastaFileNames;
+ vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
+ vector<vector<string> > tempNameFileNames = nameFileNames;
+ if(allFiles){
+ ofstream temp;
+ string extension = toString(processors-1) + ".temp";
+ for(int i=0;i<tempFASTAFileNames.size();i++){
+ for(int j=0;j<tempFASTAFileNames[i].size();j++){
+ if (tempFASTAFileNames[i][j] != "") {
+ tempFASTAFileNames[i][j] += extension;
+ m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
+
+ if(qFileName != ""){
+ tempPrimerQualFileNames[i][j] += extension;
+ m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
+ }
+ if(nameFile != ""){
+ tempNameFileNames[i][j] += extension;
+ m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
+ }
+ }
+ }
+ }
+ }
- driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
+ driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]);
processIDS.push_back(processors-1);
//subsample loop
vector< vector<double> > calcDistsTotals; //each iter, each groupCombos dists. this will be used to make .dist files
for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
-
if (m->control_pressed) { break; }
//copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
CountTable* newCt = new CountTable();
+ int sampleTime = 0;
+ if (m->debug) { sampleTime = time(NULL); }
+
//uses method of setting groups to doNotIncludeMe
SubSample sample;
Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize);
-
+
+ if (m->debug) { m->mothurOut("[DEBUG]: iter " + toString(thisIter) + " took " + toString(time(NULL) - sampleTime) + " seconds to sample tree.\n"); }
+
//call new weighted function
vector<double> iterData; iterData.resize(numComp,0);
Weighted thisWeighted(includeRoot);