CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch);
CommandParameter palign("align", "Multiple", "needleman-gotoh-blast-noalign", "needleman", "", "", "","",false,false,true); parameters.push_back(palign);
CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch);
- CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen);
- CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend);
+ CommandParameter pgapopen("gapopen", "Number", "", "-5.0", "", "", "","",false,false); parameters.push_back(pgapopen);
+ CommandParameter pgapextend("gapextend", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapextend);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pflip);
CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave);
helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.";
helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.";
helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.";
- helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.";
- helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.";
+ helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -5.0.";
+ helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.";
helpString += "The flip parameter is used to specify whether or not you want mothur to try the reverse complement if a sequence falls below the threshold. The default is false.";
helpString += "The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. The default threshold is 0.50, meaning 50% of the bases are removed in the alignment.";
helpString += "If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.";
temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; }
m->mothurConvert(temp, misMatch);
- temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; }
+ temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-5.0"; }
m->mothurConvert(temp, gapOpen);
- temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; }
+ temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-2.0"; }
m->mothurConvert(temp, gapExtend);
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
type = "chimera";
chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
}
- ;
- if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+ if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
try {
int index = 0;
- for(int i=stop-1;i>=start;i--){
+ for(int i=stop-1;i>=start-1;i--){
reverseMap[index++][qScores[i]] += weight;
}
}
catch(exception& e) {
- m->errorOut(e, "QualityScores", "updateForwardMap");
+ m->errorOut(e, "QualityScores", "updateReverseMap");
exit(1);
}
}
QualityScores();
QualityScores(ifstream&);
string getName();
-
+ int getLength(){ return (int)qScores.size(); }
vector<int> getQualityScores() { return qScores; }
void printQScores(ofstream&);
void trimQScores(int, int);
*/
#include "refchimeratest.h"
+#include "myPerseus.h"
#include "mothur.h"
int MAXINT = numeric_limits<int>::max();
exit(1);
}
}
+
//***************************************************************************************************************
int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
+
+ int numParents = -1;
+
+ if(aligned){
+ numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile);
+ }
+ else{
+ numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile);
+ }
+
+ return numParents;
+
+}
+
+//***************************************************************************************************************
+
+int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
vector<vector<int> > left(numRefSeqs);
vector<int> singleLeft, bestLeft;
}
right = left;
- int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch);
+ int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch);
+
+
int leftParentBi, rightParentBi, breakPointBi;
int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
chimeraRefSeq = referenceSeqs[bestMatch];
}
-
- double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq);
+ bestRefAlignment = chimeraRefSeq;
+ bestQueryAlignment = querySeq;
+
+ double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment);
chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
return nMera;
}
+//***************************************************************************************************************
+
+int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
+
+ int nMera = 0;
+
+ int seqLength = querySeq.length();
+
+ vector<string> queryAlign(numRefSeqs);
+ vector<string> refAlign(numRefSeqs);
+
+ vector<vector<int> > leftDiffs(numRefSeqs, 0);
+ vector<vector<int> > rightDiffs(numRefSeqs, 0);
+ vector<vector<int> > leftMaps(numRefSeqs, 0);
+ vector<vector<int> > rightMaps(numRefSeqs, 0);
+
+ int bestRefIndex = -1;
+ int bestRefDiffs = numeric_limits<int>::max();
+ double bestRefLength = 0;
+
+ for(int i=0;i<numRefSeqs;i++){
+ double length = 0;
+ int diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
+ if(diffs < bestRefDiffs){
+ bestRefDiffs = diffs;
+ bestRefLength = length;
+ bestRefIndex = i;
+ }
+ }
+
+ if(bestRefDiffs >= 3){
+ for(int i=0;i<numRefSeqs;i++){
+ leftDiffs[i].assign(seqLength, 0);
+ rightDiffs[i].assign(seqLength, 0);
+ leftMaps[i].assign(seqLength, 0);
+ rightMaps[i].assign(seqLength, 0);
+
+ getUnalignedDiffs(queryAlign[i], refAlign[i], leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
+ }
+
+ vector<int> singleLeft(seqLength, numeric_limits<int>::max());
+ vector<int> bestLeft(seqLength, -1);
+
+ for(int l=0;l<seqLength;l++){
+
+ for(int i=0;i<numRefSeqs;i++){
+ if(leftDiffs[i][l] < singleLeft[l]){
+ singleLeft[l] = leftDiffs[i][l];
+ bestLeft[l] = i;
+ }
+ }
+ }
+
+ vector<int> singleRight(seqLength, numeric_limits<int>::max());
+ vector<int> bestRight(seqLength, -1);
+
+ for(int l=0;l<seqLength;l++){
+
+ for(int i=0;i<numRefSeqs;i++){
+ if(rightDiffs[i][l] < singleRight[l]){
+ singleRight[l] = rightDiffs[i][l];
+ bestRight[l] = i;
+ }
+ }
+ }
+
+ int bestChimeraMismatches = numeric_limits<int>::max();
+ int leftParent = 0;
+ int rightParent = 0;
+ int breakPoint = 0;
+
+ for(int l=0;l<seqLength-1;l++){
+
+ int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
+ if(chimera < bestChimeraMismatches){
+ bestChimeraMismatches = chimera;
+ breakPoint = l;
+ leftParent = bestLeft[l];
+ rightParent = bestRight[seqLength - l - 2];
+ }
+ }
+
+ string reference;
+
+ if(bestRefDiffs - bestChimeraMismatches >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
+ nMera = 2;
+
+ int breakLeft = leftMaps[leftParent][breakPoint];
+ int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
+
+ string left = refAlign[leftParent];
+ string right = refAlign[rightParent];
+
+ for(int i=0;i<=breakLeft;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(left[i] != '-' && left[i] != '.'){
+ reference += left[i];
+ }
+ }
+
+
+ for(int i=breakRight;i<right.length();i++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(right[i] != '-' && right[i] != '.'){
+ reference += right[i];
+ }
+ }
+
+ }
+ else{
+ nMera = 1;
+ reference = referenceSeqs[bestRefIndex];
+ }
+
+ double alignLength;
+ double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength);
+ double finalDistance = finalDiffs / alignLength;
+
+ chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
+ chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t';
+ chimeraReportFile << bestChimeraMismatches << '\t';
+ chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl;
+ }
+ else{
+ bestQueryAlignment = queryAlign[bestRefIndex];
+ bestRefAlignment = refAlign[bestRefIndex];
+ nMera = 1;
+
+ chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
+ chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl;
+ }
+
+ bestMatch = bestRefIndex;
+ return nMera;
+}
+
/**************************************************************************************************/
-int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
+double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){
+
+
+ try {
+ double GAP = -5;
+ double MATCH = 1;
+ double MISMATCH = -1;
+
+ int queryLength = query.length();
+ int refLength = reference.length();
+
+ vector<vector<double> > alignMatrix(queryLength + 1);
+ vector<vector<char> > alignMoves(queryLength + 1);
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i].resize(refLength + 1, 0);
+ alignMoves[i].resize(refLength + 1, 'x');
+ }
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i][0] = 0;//GAP * i;
+ alignMoves[i][0] = 'u';
+ }
+
+ for(int i=0;i<=refLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[0][i] = 0;//GAP * i;
+ alignMoves[0][i] = 'l';
+ }
+
+ for(int i=1;i<=queryLength;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ for(int j=1;j<=refLength;j++){
+
+ double nogapScore;
+ if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; }
+ else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; }
+
+ double leftScore;
+ if(i == queryLength) { leftScore = alignMatrix[i][j-1]; }
+ else { leftScore = alignMatrix[i][j-1] + GAP; }
+
+
+ double upScore;
+ if(j == refLength) { upScore = alignMatrix[i-1][j]; }
+ else { upScore = alignMatrix[i-1][j] + GAP; }
+
+ if(nogapScore > leftScore){
+ if(nogapScore > upScore){
+ alignMoves[i][j] = 'd';
+ alignMatrix[i][j] = nogapScore;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = upScore;
+ }
+ }
+ else{
+ if(leftScore > upScore){
+ alignMoves[i][j] = 'l';
+ alignMatrix[i][j] = leftScore;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = upScore;
+ }
+ }
+ }
+ }
+
+ int end = refLength - 1;
+ int maxRow = 0;
+ double maxRowValue = -100000000000;
+ for(int i=0;i<queryLength;i++){
+ if(alignMatrix[i][end] > maxRowValue){
+ maxRow = i;
+ maxRowValue = alignMatrix[i][end];
+ }
+ }
+
+ end = queryLength - 1;
+ int maxColumn = 0;
+ double maxColumnValue = -100000000000;
+
+ for(int j=0;j<refLength;j++){
+ if(alignMatrix[end][j] > maxColumnValue){
+ maxColumn = j;
+ maxColumnValue = alignMatrix[end][j];
+ }
+ }
+
+ int row = queryLength-1;
+ int column = refLength-1;
+
+ if(maxColumn == column && maxRow == row){} // if the max values are the lower right corner, then we're good
+ else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){
+ for(int i=maxRow+1;i<queryLength;i++){ // decide whether sequence A or B needs the gaps at the end either set
+ alignMoves[i][column] = 'u';// the pointer upwards or...
+ }
+
+ }
+ else {
+ for(int i=maxColumn+1;i<refLength;i++){
+ alignMoves[row][i] = 'l'; // ...to the left
+ }
+ }
+
+ int i = queryLength;
+ int j = refLength;
+
+
+ qAlign = "";
+ rAlign = "";
+
+ int diffs = 0;
+ length = 0;
+
+ while(i > 0 && j > 0){
+
+ if (m->control_pressed) { return 0; }
+
+ if(alignMoves[i][j] == 'd'){
+ qAlign = query[i-1] + qAlign;
+ rAlign = reference[j-1] + rAlign;
+
+ if(query[i-1] != reference[j-1]){ diffs++; }
+ length++;
+
+ i--;
+ j--;
+ }
+ else if(alignMoves[i][j] == 'u'){
+ qAlign = query[i-1] + qAlign;
+
+ if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; }
+ else { rAlign = '.' + rAlign; }
+ i--;
+ }
+ else if(alignMoves[i][j] == 'l'){
+ rAlign = reference[j-1] + rAlign;
+
+ if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; }
+ else { qAlign = '.' + qAlign; }
+ j--;
+ }
+ }
+
+ if(i>0){
+ qAlign = query.substr(0, i) + qAlign;
+ rAlign = string(i, '.') + rAlign;
+ }
+ else if(j>0){
+ qAlign = string(j, '.') + qAlign;
+ rAlign = reference.substr(0, j) + rAlign;
+ }
+
+
+ return diffs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RefChimeraTest", "alignQueryToReferences");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
+ try {
+ int alignLength = qAlign.length();
+
+ int lDiffs = 0;
+ int lCount = 0;
+ for(int l=0;l<alignLength;l++){
+
+ if (m->control_pressed) { return 0; }
+
+ if(qAlign[l] == '-'){
+ lDiffs++;
+ }
+ else if(qAlign[l] != '.'){
+
+ if(rAlign[l] == '-'){
+ lDiffs++;
+ }
+ else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
+ lDiffs++;
+ }
+ leftDiffs[lCount] = lDiffs;
+ leftMap[lCount] = l;
+
+ lCount++;
+ }
+ }
+
+ int rDiffs = 0;
+ int rCount = 0;
+ for(int l=alignLength-1;l>=0;l--){
+
+ if (m->control_pressed) { return 0; }
+
+ if(qAlign[l] == '-'){
+ rDiffs++;
+ }
+ else if(qAlign[l] != '.'){
+
+ if(rAlign[l] == '-'){
+ rDiffs++;
+ }
+ else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
+ rDiffs++;
+ }
+
+ rightDiffs[rCount] = rDiffs;
+ rightMap[rCount] = l;
+ rCount++;
+ }
+
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getAlignedMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
int bestSequenceMismatch = MAXINT;
}
//***************************************************************************************************************
+
+string RefChimeraTest::getQueryAlignment(){
+
+ return bestQueryAlignment;
+
+}
+
+//***************************************************************************************************************
+
+string RefChimeraTest::getClosestRefAlignment(){
+
+ return bestRefAlignment;
+
+}
+
+//***************************************************************************************************************
class RefChimeraTest {
public:
- RefChimeraTest(vector<Sequence>&, bool);
+ RefChimeraTest(){};
+ RefChimeraTest(vector<Sequence>&, bool);
int printHeader(ofstream&);
- int analyzeQuery(string, string, ofstream&);
+ int analyzeQuery(string, string, ofstream&);
int getClosestRefIndex();
+ string getClosestRefAlignment();
+ string getQueryAlignment();
+
private:
- int getMismatches(string&, vector<vector<int> >&, vector<vector<int> >&, int&);
- int getChimera(vector<vector<int> >&, vector<vector<int> >&, int&, int&, int&, vector<int>&, vector<int>&, vector<int>&, vector<int>&);
+ int getAlignedMismatches(string&, vector<vector<int> >&, vector<vector<int> >&, int&);
+ int analyzeAlignedQuery(string, string, ofstream&);
+ int analyzeUnalignedQuery(string, string, ofstream&);
+ double alignQueryToReferences(string, string, string&, string&, double&);
+ int getUnalignedDiffs(string, string, vector<int>&, vector<int>&, vector<int>&, vector<int>&);
+
+ int getChimera(vector<vector<int> >&, vector<vector<int> >&, int&, int&, int&, vector<int>&, vector<int>&, vector<int>&, vector<int>&);
int getTrimera(vector<vector<int> >&, vector<vector<int> >&, int&, int&, int&, int&, int&, vector<int>&, vector<int>&, vector<int>&, vector<int>&);
string stitchBimera(int, int, int);
string stitchTrimera(int, int, int, int, int);
int numRefSeqs;
int alignLength;
int bestMatch;
+ string bestRefAlignment;
+ string bestQueryAlignment;
//ofstream chimeraReportFile;
bool aligned;
#include "reportfile.h"
#include "qualityscores.h"
#include "refchimeratest.h"
+#include "myPerseus.h"
#include "filterseqscommand.h"
//**********************************************************************************************************************
+
vector<string> SeqErrorCommand::setParameters(){
try {
CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none","errorType",false,true,true); parameters.push_back(pquery);
CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pname);
CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pignorechimeras);
CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pthreshold);
- CommandParameter paligned("aligned", "Boolean", "T", "", "", "", "","",false,false); parameters.push_back(paligned);
+ CommandParameter paligned("aligned", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(paligned);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
exit(1);
}
}
+
//**********************************************************************************************************************
+
string SeqErrorCommand::getHelpString(){
try {
string helpString = "";
exit(1);
}
}
+
//**********************************************************************************************************************
+
string SeqErrorCommand::getOutputPattern(string type) {
try {
string pattern = "";
exit(1);
}
}
+
//**********************************************************************************************************************
+
SeqErrorCommand::SeqErrorCommand(){
try {
abort = true; calledHelp = true;
exit(1);
}
}
+
//***************************************************************************************************************
SeqErrorCommand::SeqErrorCommand(string option) {
if(reportFileName == "not found"){ reportFileName = ""; }
else if (reportFileName == "not open") { reportFileName = ""; abort = true; }
- if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
- m->mothurOut("if you use either a qual file or a report file, you have to have both.");
- m->mothurOutEndLine();
- abort = true;
- }
-
outputDir = validParameter.validFile(parameters, "outputdir", false);
if (outputDir == "not found"){
outputDir = "";
temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; }
m->mothurConvert(temp, threshold);
- temp = validParameter.validFile(parameters, "aligned", true); if (temp == "not found"){ temp = "t"; }
- aligned = m->isTrue(temp);
-// rdb->aligned = aligned; #do we need these lines for aligned?
-// if (aligned) { //clear out old references
-// rdb->clearMemory();
-// }
temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; }
save = m->isTrue(temp);
temp = validParameter.validFile(parameters, "ignorechimeras", false); if (temp == "not found") { temp = "T"; }
ignoreChimeras = m->isTrue(temp);
+ temp = validParameter.validFile(parameters, "aligned", false); if (temp == "not found"){ temp = "t"; }
+ aligned = m->isTrue(temp);
+
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
m->mothurConvert(temp, processors);
vector<string> files; files.push_back(queryFileName);
parser.getNameFile(files);
}
+
+ if(aligned == true){
+ if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){
+ m->mothurOut("if you use either a qual file or a report file, you have to have both.");
+ m->mothurOutEndLine();
+ abort = true;
+ }
+ }
+ else{
+ if(reportFileName != ""){
+ m->mothurOut("we are ignoring the report file if your sequences are not aligned. we will check that the sequences in your fasta and and qual fileare the same length.");
+ m->mothurOutEndLine();
+ }
+ }
+
+
}
}
catch(exception& e) {
exit(1);
}
}
+
//***************************************************************************************************************
int SeqErrorCommand::execute(){
if (abort == true) { if (calledHelp) { return 0; } return 2; }
int start = time(NULL);
- maxLength = 2000;
+ maxLength = 5000;
totalBases = 0;
totalMatches = 0;
for (int i = 0; i < (fastaFilePos.size()-1); i++) {
lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
if (qualFileName != "") { qLines.push_back(linePair(qFilePos[i], qFilePos[(i+1)])); }
- if (reportFileName != "") { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); }
+ if (reportFileName != "" && aligned == true) { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); }
}
if(qualFileName == "") { qLines = lines; rLines = lines; } //fills with duds
-
+ if(aligned == false){ rLines = lines; }
int numSeqs = 0;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]);
+
}else{
numSeqs = createProcesses(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName);
}
#else
numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]);
#endif
-
- if(qualFileName != "" && reportFileName != ""){
+
+ if(qualFileName != ""){
printErrorQuality(qScoreErrorMap);
printQualityFR(qualForwardMap, qualReverseMap);
}
}
errorCountFile.close();
- if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+// if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
printSubMatrix();
-
+
string megAlignmentFileName = getOutputFileName("errorref-query",variables);
ofstream megAlignmentFile;
m->openOutputFile(megAlignmentFileName, megAlignmentFile);
outputNames.push_back(megAlignmentFileName); outputTypes["errorref-query"].push_back(megAlignmentFileName);
-
+
for(int i=0;i<numRefs;i++){
megAlignmentFile << referenceSeqs[i].getInlineSeq() << endl;
megAlignmentFile << megaAlignVector[i] << endl;
}
+ megAlignmentFile.close();
+
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences.");
m->mothurOutEndLine();
exit(1);
}
}
+
//**********************************************************************************************************************
+
int SeqErrorCommand::createProcesses(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName) {
try {
int process = 1;
exit(1);
}
}
+
//**********************************************************************************************************************
+
int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName, linePair line, linePair qline, linePair rline) {
try {
//open inputfiles and go to beginning place for this processor
ifstream queryFile;
m->openInputFile(filename, queryFile);
+
queryFile.seekg(line.start);
ifstream reportFile;
ifstream qualFile;
- if(qFileName != "" && rFileName != ""){
+ if((qFileName != "" && rFileName != "" && aligned)){
m->openInputFile(qFileName, qualFile);
qualFile.seekg(qline.start);
qualReverseMap[i].assign(41,0);
}
}
-
+ else if(qFileName != "" && !aligned){
+
+ m->openInputFile(qFileName, qualFile);
+ qualFile.seekg(qline.start);
+
+ qualForwardMap.resize(maxLength);
+ qualReverseMap.resize(maxLength);
+ for(int i=0;i<maxLength;i++){
+ qualForwardMap[i].assign(41,0);
+ qualReverseMap[i].assign(41,0);
+ }
+ }
+
ofstream outChimeraReport;
m->openOutputFile(chimeraOutputFileName, outChimeraReport);
- RefChimeraTest chimeraTest(referenceSeqs, aligned);
- if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); }
+ RefChimeraTest chimeraTest;
+
+ chimeraTest = RefChimeraTest(referenceSeqs, aligned);
+ if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); }
+
ofstream errorSummaryFile;
m->openOutputFile(summaryFileName, errorSummaryFile);
ofstream errorSeqFile;
m->openOutputFile(errorOutputFileName, errorSeqFile);
- megaAlignVector.resize(numRefs, "");
+ megaAlignVector.assign(numRefs, "");
int index = 0;
bool ignoreSeq = 0;
bool moreSeqs = 1;
while (moreSeqs) {
-
- if (m->control_pressed) { queryFile.close(); if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } outChimeraReport.close(); errorSummaryFile.close();errorSeqFile.close(); return 0; }
-
+
Sequence query(queryFile);
-
- int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport);
- int closestRefIndex = chimeraTest.getClosestRefIndex();
-
+ Sequence reference;
+ int numParentSeqs = -1;
+ int closestRefIndex = -1;
+
+ numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport);
+
+ closestRefIndex = chimeraTest.getClosestRefIndex();
+
+ reference = referenceSeqs[closestRefIndex];
+
+ reference.setAligned(chimeraTest.getClosestRefAlignment());
+ query.setAligned(chimeraTest.getQueryAlignment());
+
if(numParentSeqs > 1 && ignoreChimeras == 1) { ignoreSeq = 1; }
- else { ignoreSeq = 0; }
-
+ else { ignoreSeq = 0; }
+
Compare minCompare;
- getErrors(query, referenceSeqs[closestRefIndex], minCompare);
+
+ getErrors(query, reference, minCompare);
if(namesFileName != ""){
it = weights.find(query.getName());
}
else{ minCompare.weight = 1; }
+
printErrorData(minCompare, numParentSeqs, errorSummaryFile, errorSeqFile);
if(!ignoreSeq){
-
for(int i=0;i<minCompare.sequence.length();i++){
char letter = minCompare.sequence[i];
-
if(letter != 'r'){
errorForward[letter][i] += minCompare.weight;
- errorReverse[letter][minCompare.total-i-1] += minCompare.weight;
+ errorReverse[letter][minCompare.total-i-1] += minCompare.weight;
}
- }
+ }
}
- if(qualFileName != "" && reportFileName != ""){
+ if(aligned && qualFileName != "" && reportFileName != ""){
report = ReportFile(reportFile);
// int origLength = report.getQueryLength();
quality = QualityScores(qualFile);
if(!ignoreSeq){
-// cout << query.getName() << '\t';
-
quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
quality.updateForwardMap(qualForwardMap, startBase, endBase, minCompare.weight);
quality.updateReverseMap(qualReverseMap, startBase, endBase, minCompare.weight);
+ }
+ }
+ else if(aligned == false && qualFileName != ""){
+
+ quality = QualityScores(qualFile);
+ int qualityLength = quality.getLength();
+
+ if(qualityLength != query.getNumBases()){ cout << "warning - quality and fasta sequence files do not match at " << query.getName() << '\t' << qualityLength <<'\t' << query.getNumBases() << endl; }
+
+ int startBase = 1;
+ int endBase = qualityLength;
-// cout << endl;
+ if(!ignoreSeq){
+ quality.updateQScoreErrorMap(qScoreErrorMap, minCompare.sequence, startBase, endBase, minCompare.weight);
+ quality.updateForwardMap(qualForwardMap, startBase, endBase, minCompare.weight);
+ quality.updateReverseMap(qualReverseMap, startBase, endBase, minCompare.weight);
}
- }
-
- if(minCompare.errorRate < threshold && !ignoreSeq){
+ }
+
+ if(minCompare.errorRate <= threshold && !ignoreSeq){
totalBases += (minCompare.total * minCompare.weight);
totalMatches += minCompare.matches * minCompare.weight;
if(minCompare.mismatches > maxMismatch){
if(index % 100 == 0){ m->mothurOut(toString(index)); m->mothurOutEndLine(); }
}
queryFile.close();
- if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); }
- errorSummaryFile.close();
+ outChimeraReport.close();
+ errorSummaryFile.close();
errorSeqFile.close();
-
+
+ if(qFileName != "" && rFileName != "") { reportFile.close(); qualFile.close(); }
+ else if(qFileName != "" && aligned == false){ qualFile.close(); }
+
//report progress
- if(index % 100 != 0){ m->mothurOut(toString(index)); m->mothurOutEndLine(); }
+ m->mothurOut(toString(index)); m->mothurOutEndLine();
return index;
}
exit(1);
}
}
+
//***************************************************************************************************************
void SeqErrorCommand::getReferences(){
int started = 0;
//Compare errors;
-
+
+ errors.sequence = "";
for(int i=0;i<alignLength;i++){
-// cout << r[i] << '\t' << q[i] << '\t';
+
if(q[i] != '.' && r[i] != '.' && (q[i] != '-' || r[i] != '-')){ // no missing data and no double gaps
if(r[i] != 'N'){
started = 1;
errors.sequence += 'r';
}
}
- }
-
+ }
else if(q[i] == '.' && r[i] != '.'){ // reference extends beyond query
if(started == 1){ break; }
}
else if(q[i] == '.' && r[i] == '.'){ // both are missing data
if(started == 1){ break; }
}
-// cout << errors.sequence[errors.sequence.length()-1] << endl;
}
-// cout << errors.sequence << endl;
+
errors.mismatches = errors.total-errors.matches;
- errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
+ if(errors.total != 0){ errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total; }
+ else{ errors.errorRate = 0; }
+
errors.queryName = query.getName();
errors.refName = reference.getName();
- //return errors;
+
return 0;
}
catch(exception& e) {
nameCountMap[seqName] = m->getNumNames(redundantSeqs);
m->gobble(nameFile);
}
+
+ nameFile.close();
+
return nameCountMap;
}
-
//***************************************************************************************************************
void SeqErrorCommand::printErrorHeader(ofstream& errorSummaryFile){
exit(1);
}
}
+
//***************************************************************************************************************
void SeqErrorCommand::printErrorFRFile(map<char, vector<int> > errorForward, map<char, vector<int> > errorReverse){
errorQualityFile.close();
}
catch(exception& e) {
- m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
+ m->errorOut(e, "SeqErrorCommand", "printErrorQuality");
exit(1);
}
}
-
//***************************************************************************************************************
void SeqErrorCommand::printQualityFR(vector<vector<int> > qualForwardMap, vector<vector<int> > qualReverseMap){
qualityReverseFile.close();
}
catch(exception& e) {
- m->errorOut(e, "SeqErrorCommand", "printErrorFRFile");
+ m->errorOut(e, "SeqErrorCommand", "printQualityFR");
exit(1);
}
}
+
/**************************************************************************************************/
int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos, vector<unsigned long long>& rfileFilePos) {
qfileFilePos.push_back(size);
- //seach for filePos of each first name in the rfile and save in rfileFilePos
- string junk;
- ifstream inR;
- m->openInputFile(rfilename, inR);
-
- //read column headers
- for (int i = 0; i < 16; i++) {
- if (!inR.eof()) { inR >> junk; }
- else { break; }
- }
-
- while(!inR.eof()){
-
- if (m->control_pressed) { inR.close(); return processors; }
-
- input = m->getline(inR);
-
- if (input.length() != 0) {
-
- istringstream nameStream(input);
- string sname = ""; nameStream >> sname;
-
- map<string, int>::iterator it = firstSeqNamesReport.find(sname);
-
- if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk
- unsigned long long pos = inR.tellg();
- rfileFilePos.push_back(pos - input.length() - 1);
- firstSeqNamesReport.erase(it);
- }
- }
-
- if (firstSeqNamesReport.size() == 0) { break; }
- m->gobble(inR);
- }
- inR.close();
-
- if (firstSeqNamesReport.size() != 0) {
- for (map<string, int>::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) {
- m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine();
- }
- m->control_pressed = true;
- return processors;
- }
-
- //get last file position of qfile
- FILE * rFile;
- unsigned long long sizeR;
-
- //get num bytes in file
- rFile = fopen (rfilename.c_str(),"rb");
- if (rFile==NULL) perror ("Error opening file");
- else{
- fseek (rFile, 0, SEEK_END);
- sizeR=ftell (rFile);
- fclose (rFile);
+ if(aligned){
+ //seach for filePos of each first name in the rfile and save in rfileFilePos
+ string junk;
+ ifstream inR;
+
+ m->openInputFile(rfilename, inR);
+
+ //read column headers
+ for (int i = 0; i < 16; i++) {
+ if (!inR.eof()) { inR >> junk; }
+ else { break; }
+ }
+
+ while(!inR.eof()){
+
+ input = m->getline(inR);
+
+ if (input.length() != 0) {
+
+ istringstream nameStream(input);
+ string sname = ""; nameStream >> sname;
+
+ map<string, int>::iterator it = firstSeqNamesReport.find(sname);
+
+ if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk
+ unsigned long long pos = inR.tellg();
+ rfileFilePos.push_back(pos - input.length() - 1);
+ firstSeqNamesReport.erase(it);
+ }
+ }
+
+ if (firstSeqNamesReport.size() == 0) { break; }
+ m->gobble(inR);
+ }
+ inR.close();
+
+ if (firstSeqNamesReport.size() != 0) {
+ for (map<string, int>::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) {
+ m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine();
+ }
+ m->control_pressed = true;
+ return processors;
+ }
+
+ //get last file position of qfile
+ FILE * rFile;
+ unsigned long long sizeR;
+
+ //get num bytes in file
+ rFile = fopen (rfilename.c_str(),"rb");
+ if (rFile==NULL) perror ("Error opening file");
+ else{
+ fseek (rFile, 0, SEEK_END);
+ sizeR=ftell (rFile);
+ fclose (rFile);
+ }
+
+ rfileFilePos.push_back(sizeR);
}
-
- rfileFilePos.push_back(sizeR);
-
return processors;
#else
exit(1);
}
}
+
//***************************************************************************************************************
else if (type == "fasta") { pattern = "[filename],[tag],fasta"; }
else if (type == "group") { pattern = "[filename],groups"; }
else if (type == "name") { pattern = "[filename],[tag],names"; }
- else if (type == "count") { pattern = "[filename],[tag],count_table"; }
+ else if (type == "count") { pattern = "[filename],[tag],count_table-[filename],count_table"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
map<string, string> myvariables;
myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(it->first));
string thisGroupName = "";
- if (countfile == "") { thisGroupName = getOutputFileName("group",variables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
- else { thisGroupName = getOutputFileName("count",variables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
+ if (countfile == "") { thisGroupName = getOutputFileName("group",myvariables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
+ else { thisGroupName = getOutputFileName("count",myvariables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
m->openOutputFile(thisGroupName, out);
if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; }