AlignCommand::AlignCommand(){
try {
globaldata = GlobalData::getInstance();
- candidateFileName = globaldata->getCandidateFile();
- templateFileName = globaldata->getFastaFile();
- openInputFile(candidateFileName, in);
+ if(globaldata->getFastaFile() == "" && globaldata->getPhylipFile() == "" && globaldata->getNexusFile() == "" && globaldata->getClustalFile() == ""){
+ cout << "you forgot a template file" << endl;
+ }
+ openInputFile(globaldata->getCandidateFile(), in);
+
convert(globaldata->getKSize(), kmerSize);
convert(globaldata->getMatch(), match);
convert(globaldata->getMismatch(), misMatch);
srand( (unsigned)time( NULL ) ); //needed to assign names to temporary files
Database* templateDB;
- if(globaldata->getSearch() == "kmer") { templateDB = new KmerDB(templateFileName, kmerSize); }
- else if(globaldata->getSearch() == "suffix") { templateDB = new SuffixDB(templateFileName); }
- else if(globaldata->getSearch() == "blast") { templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch); }
- else { cout << globaldata->getSearch() << " is not a valid search option. I will run the command using suffix." << endl;
- templateDB = new SuffixDB(templateFileName); }
+ if(globaldata->getSearch() == "kmer") { templateDB = new KmerDB(globaldata->getFastaFile() , kmerSize); }
+ else if(globaldata->getSearch() == "suffix") { templateDB = new SuffixDB(globaldata->getFastaFile()); }
+ else if(globaldata->getSearch() == "blast") { templateDB = new BlastDB(globaldata->getFastaFile(), gapOpen, gapExtend, match, misMatch); }
+ else {
+ cout << globaldata->getSearch() << " is not a valid search option. I will run the command using kmer, ksize=8." << endl;
+ templateDB = new KmerDB(globaldata->getFastaFile(), kmerSize);
+ }
Alignment* alignment;
- if(globaldata->getAlign() == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000); }
- else if(globaldata->getAlign() == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); }
- else if(globaldata->getAlign() == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
- else if(globaldata->getAlign() == "noalign") { alignment = new NoAlign(); }
- else { cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using blast." << endl;
- alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+ if(globaldata->getAlign() == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000); }
+ else if(globaldata->getAlign() == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); }
+ else if(globaldata->getAlign() == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+ else if(globaldata->getAlign() == "noalign") { alignment = new NoAlign(); }
+ else {
+ cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using needleman." << endl;
+ alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000);
+ }
int numFastaSeqs=count(istreambuf_iterator<char>(in),istreambuf_iterator<char>(), '>');
in.seekg(0);
- string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.align";
+ candidateFileName = globaldata->getCandidateFile();
+ string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align";
ofstream candidateAlignmentFile;
openOutputFile(candidateAligngmentFName, candidateAlignmentFile);
- string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.report";
+ string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align.report";
NastReport report(candidateReportFName);
cout << "We are going to align the " << numFastaSeqs << " sequences in " << candidateFileName << "..." << endl;
Sequence* templateSeq = templateDB->findClosestSequence(candidateSeq);
report.setTemplate(templateSeq);
report.setSearchParameters(globaldata->getSearch(), templateDB->getSearchScore());
+
Nast nast(alignment, candidateSeq, templateSeq);
report.setAlignmentParameters(globaldata->getAlign(), alignment);
class AlignCommand : public Command {
- public:
- AlignCommand();
- ~AlignCommand();
- int execute();
-
- private:
- GlobalData* globaldata;
- string candidateFileName, templateFileName, distanceFileName;
- int kmerSize;
- float match, misMatch, gapOpen, gapExtend;
- ofstream out;
- ifstream in;
+public:
+ AlignCommand();
+ ~AlignCommand();
+ int execute();
+
+private:
+ GlobalData* globaldata;
+ string candidateFileName, templateFileName, distanceFileName;
+ int kmerSize;
+ float match, misMatch, gapOpen, gapExtend;
+ ofstream out;
+ ifstream in;
};
cout << endl << "Reading in the " << fastaFileName << " template sequences...\t"; cout.flush();
+ //all of this is elsewhere already!
numSeqs=count(istreambuf_iterator<char>(fastaFile),istreambuf_iterator<char>(), '>'); // count the number of
fastaFile.seekg(0); // sequences
string seqName, sequence;
for(int i=0;i<numSeqs;i++){
-// templateSequences[i] = new Sequence(); // We're storing the aligned template sequences as a vector of
- // pointers to Sequence objects
fastaFile >> seqName;
-// templateSequences[i]->setName(seqName);
-
+ seqName = seqName.substr(1);
char letter;
string aligned;
}
}
templateSequences[i] = new Sequence(seqName, aligned);
-// templateSequences[i]->setAligned(aligned); // Obviously, we need the fully aligned template sequence
-// templateSequences[i]->setUnaligned(aligned);// We will also need the unaligned sequence for pairwise alignments
- fastaFile.putback(letter); // and database searches
+ fastaFile.putback(letter);
}
fastaFile.close();
+ //all of this is elsewhere already!
+
cout << "DONE." << endl; cout.flush();
}
FilterSeqsCommand::FilterSeqsCommand(){
globaldata = GlobalData::getInstance();
- if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); }
- else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); }
- else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); }
- else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); }
-
- readSeqs->read();
- db = readSeqs->getDB();
- numSeqs = db->size();
-
- alignmentLength = db->get(0).getAlignLength();
-
- filter = string(alignmentLength, '1');
+ if(globaldata->getFastaFile() == "") { cout << "You must enter a fasta formatted file" << endl; }
+ trump = globaldata->getTrump()[0];
+ vertical =
+// readSeqs->read();
+// db = readSeqs->getDB();
+// numSeqs = db->size();
+//
+// alignmentLength = db->get(0).getAlignLength();
+//
+// filter = string(alignmentLength, '1');
}
/**************************************************************************************/
void FilterSeqsCommand::doHard() {
- string hardName = globaldata->getHard();
- string hardFilter = "";
-
- ifstream fileHandle;
- openInputFile(hardName, fileHandle);
-
- fileHandle >> hardFilter;
-
- if(hardFilter.length() != filter.length()){
- cout << "The hard filter is not the same length as the alignment: Hard filter will not be applied." << endl;
- }
- else{
- filter = hardFilter;
- }
-
+// string hardName = globaldata->getHard();
+// string hardFilter = "";
+//
+// ifstream fileHandle;
+// openInputFile(hardName, fileHandle);
+//
+// fileHandle >> hardFilter;
+//
+// if(hardFilter.length() != filter.length()){
+// cout << "The hard filter is not the same length as the alignment: Hard filter will not be applied." << endl;
+// }
+// else{
+// filter = hardFilter;
+// }
+
}
/**************************************************************************************/
void FilterSeqsCommand::doTrump() {
- char trump = globaldata->getTrump()[0];
for(int i = 0; i < numSeqs; i++) {
string curAligned = db->get(i).getAligned();;
void FilterSeqsCommand::doVertical() {
- vector<int> counts(alignmentLength, 0);
-
- for(int i = 0; i < numSeqs; i++) {
- string curAligned = db->get(i).getAligned();;
-
- for(int j = 0; j < alignmentLength; j++) {
- if(curAligned[j] == '-' || curAligned[j] == '.'){
- counts[j]++;
- }
- }
- }
- for(int i=0;i<alignmentLength;i++){
- if(counts[i] == numSeqs) { filter[i] = '0'; }
- }
+// vector<int> counts(alignmentLength, 0);
+//
+// for(int i = 0; i < numSeqs; i++) {
+// string curAligned = db->get(i).getAligned();;
+//
+// for(int j = 0; j < alignmentLength; j++) {
+// if(curAligned[j] == '-' || curAligned[j] == '.'){
+// counts[j]++;
+// }
+// }
+// }
+// for(int i=0;i<alignmentLength;i++){
+// if(counts[i] == numSeqs) { filter[i] = '0'; }
+// }
}
/**************************************************************************************/
void FilterSeqsCommand::doSoft() {
- int softThreshold = numSeqs * (float)atoi(globaldata->getSoft().c_str()) / 100.0;
-
- vector<int> a(alignmentLength, 0);
- vector<int> t(alignmentLength, 0);
- vector<int> g(alignmentLength, 0);
- vector<int> c(alignmentLength, 0);
- vector<int> x(alignmentLength, 0);
-
- for(int i=0;i<numSeqs;i++){
- string curAligned = db->get(i).getAligned();;
-
- for(int j=0;j<alignmentLength;j++){
- if(toupper(curAligned[j]) == 'A') { a[j]++; }
- else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[i]) == 'U') { t[j]++; }
- else if(toupper(curAligned[j]) == 'G') { g[j]++; }
- else if(toupper(curAligned[j]) == 'C') { c[j]++; }
- }
- }
-
- for(int i=0;i<alignmentLength;i++){
- if(a[i] < softThreshold && t[i] < softThreshold && g[i] < softThreshold && c[i] < softThreshold){
- filter[i] = '0';
- }
- }
+// int softThreshold = numSeqs * (float)atoi(globaldata->getSoft().c_str()) / 100.0;
+//
+// vector<int> a(alignmentLength, 0);
+// vector<int> t(alignmentLength, 0);
+// vector<int> g(alignmentLength, 0);
+// vector<int> c(alignmentLength, 0);
+// vector<int> x(alignmentLength, 0);
+//
+// for(int i=0;i<numSeqs;i++){
+// string curAligned = db->get(i).getAligned();;
+//
+// for(int j=0;j<alignmentLength;j++){
+// if(toupper(curAligned[j]) == 'A') { a[j]++; }
+// else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[i]) == 'U') { t[j]++; }
+// else if(toupper(curAligned[j]) == 'G') { g[j]++; }
+// else if(toupper(curAligned[j]) == 'C') { c[j]++; }
+// }
+// }
+//
+// for(int i=0;i<alignmentLength;i++){
+// if(a[i] < softThreshold && t[i] < softThreshold && g[i] < softThreshold && c[i] < softThreshold){
+// filter[i] = '0';
+// }
+// }
}
/**************************************************************************************/
int FilterSeqsCommand::execute() {
try {
-
- if(globaldata->getHard().compare("") != 0) { doHard(); } // has to be applied first!
- if(globaldata->getTrump().compare("") != 0) { doTrump(); }
- if(globaldata->getVertical() == "T") { doVertical(); }
- if(globaldata->getSoft().compare("") != 0) { doSoft(); }
+
+ ifstream inFASTA;
+ openInputFile(globaldata->getFastaFile(), inFASTA);
+
+ Sequence currSequence(inFASTA);
+ alignmentLength = currSequence.getAlignLength();
+
+ //while
+
+
+ if(globaldata->getHard().compare("") != 0) { doHard(); } // has to be applied first!
+ if(globaldata->getTrump().compare("") != 0) { doTrump(); }
+ if(isTrue(globaldata->getVertical())) { doVertical(); }
+ if(globaldata->getSoft().compare("") != 0) { doSoft(); }
+
+
+
+
+
+
ofstream outfile;
string filterFile = getRootName(globaldata->inputFileName) + "filter";
openOutputFile(filterFile, outfile);
void doSoft();
void doHard();
void doVertical();
-
+ string filter;
int alignmentLength;
- int numSeqs;
+
+ char trump;
+ bool vertical;
GlobalData* globaldata;
- ReadSeqs* readSeqs;
- SequenceDB* db;
+// ReadSeqs* readSeqs;
+// SequenceDB* db;
- string filter;
};
if (key == "nexus" ) { nexusfile = value; inputFileName = value; fileroot = value; format = "nexus"; }
if (key == "clustal" ) { clustalfile = value; inputFileName = value; fileroot = value; format = "clustal"; }
if (key == "tree" ) { treefile = value; inputFileName = value; fileroot = value; format = "tree"; }
- if (key == "shared" ) { sharedfile = value; inputFileName = value; fileroot = value; format = "sharedfile"; }
+ if (key == "shared" ) { sharedfile = value; inputFileName = value; fileroot = value; format = "sharedfile"; }
if (key == "name" ) { namefile = value; }
if (key == "order" ) { orderfile = value; }
if (key == "group" ) { groupfile = value; }
processors = "1";
size = "0";
search = "kmer";
- ksize = "7";
+ ksize = "8";
align = "needleman";
match = "1.0";
mismatch = "-1.0";
processors = "1";
size = "0";
search = "kmer";
- ksize = "7";
+ ksize = "8";
align = "needleman";
match = "1.0";
mismatch = "-1.0";
/**************************************************************************************************/
Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
-
- vector<int> matches(numSeqs, 0); // a record of the sequences with shared kmers
- vector<int> timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found
+
+ Kmer kmer(kmerSize);
searchScore = 0;
int maxSequence = 0;
-
- string query = candidateSeq->getUnaligned(); // we want to search using an unaligned dna sequence
- int numKmers = query.length() - kmerSize + 1;
- Kmer kmer(kmerSize);
-
+ vector<int> matches(numSeqs, 0); // a record of the sequences with shared kmers
+ vector<int> timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found
+
+ int numKmers = candidateSeq->getNumBases() - kmerSize + 1;
+
for(int i=0;i<numKmers;i++){
- int kmerNumber = kmer.getKmerNumber(query, i); // go through the query sequence and get a kmer number
+ int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i); // go through the query sequence and get a kmer number
if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before...
for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
matches[kmerLocations[kmerNumber][j]]++; // that kmer
}
for(int i=0;i<numSeqs;i++){ // find the db sequence that shares the most kmers with
- if(matches[i] > searchScore){ // the query sequence
+ if(matches[i] > searchScore){ // the query sequence
searchScore = matches[i];
maxSequence = i;
}
}
- searchScore = 100 * searchScore / (float)numKmers;
- return templateSequences[maxSequence]; // return the Sequence object corresponding to the db
- // sequence with the most shared kmers.
+
+ searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db
+ return templateSequences[maxSequence]; // sequence with the most shared kmers.
}
/**************************************************************************************************/
}
+/***********************************************************************/
+
+inline int getNumSeqs(ifstream& file){
+
+ int numSeqs = count(istreambuf_iterator<char>(file),istreambuf_iterator<char>(), '>');
+ file.seekg(0);
+ return numSeqs;
+
+}
+
/***********************************************************************/
//This function parses the estimator options and puts them in a vector
ScreenSeqsCommand::ScreenSeqsCommand(){
try {
globaldata = GlobalData::getInstance();
-
- if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); }
- else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); }
- else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); }
- else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); }
-
- readSeqs->read();
- db = readSeqs->getDB();
- numSeqs = db->size();
+ if(globaldata->getFastaFile() == "") { cout << "you must provide a fasta formatted file" << endl; }
}
catch(exception& e) {
cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
//***************************************************************************************************************
-ScreenSeqsCommand::~ScreenSeqsCommand(){
- delete readSeqs;
-}
+ScreenSeqsCommand::~ScreenSeqsCommand(){ /* do nothing */ }
//***************************************************************************************************************
convert(globaldata->getMinLength(), minLength);
convert(globaldata->getMaxLength(), maxLength);
+ ifstream inFASTA;
+ openInputFile(globaldata->getFastaFile(), inFASTA);
+
set<string> badSeqNames;
string goodSeqFile = getRootName(globaldata->inputFileName) + "good" + getExtension(globaldata->inputFileName);
ofstream goodSeqOut; openOutputFile(goodSeqFile, goodSeqOut);
ofstream badSeqOut; openOutputFile(badSeqFile, badSeqOut);
-
- for(int i=0;i<numSeqs;i++){
- Sequence currSeq = db->get(i);
+
+ while(!inFASTA.eof()){
+ Sequence currSeq(inFASTA);
bool goodSeq = 1; // innocent until proven guilty
- if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
- if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
- if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
- if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()){ goodSeq = 0; }
- if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
- if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
+ if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
+ if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
+ if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
+ if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
+ if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
+ if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
if(goodSeq == 1){
currSeq.printSequence(goodSeqOut);
currSeq.printSequence(badSeqOut);
badSeqNames.insert(currSeq.getName());
}
- }
-
- if(globaldata->getNameFile() != ""){
- screenNameGroupFile(badSeqNames);
-
- }
+ gobble(inFASTA);
+ }
return 0;
}
//***************************************************************************************************************
+void ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
+
+ ifstream inputGroups;
+ openInputFile(globaldata->getGroupFile(), inputGroups);
+ string seqName, group;
+ set<string>::iterator it;
+
+ string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile());
+ string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile());
+
+ ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut);
+ ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut);
+
+ while(!inputGroups.eof()){
+ inputGroups >> seqName >> group;
+ it = badSeqNames.find(seqName);
+
+ if(it != badSeqNames.end()){
+ badSeqNames.erase(it);
+ badGroupOut << seqName << '\t' << group << endl;
+ }
+ else{
+ goodGroupOut << seqName << '\t' << group << endl;
+ }
+ gobble(inputGroups);
+ }
+ inputGroups.close();
+ goodGroupOut.close();
+ badGroupOut.close();
+
+}
+
+//***************************************************************************************************************
+
#include "readnexus.h"
#include "readclustal.h"
#include "readseqsphylip.h"
-#include <set>
using namespace std;
int execute();
private:
void screenNameGroupFile(set<string>);
- int numSeqs;
+ void screenGroupFile(set<string>);
+
GlobalData* globaldata;
- ReadSeqs* readSeqs;
- SequenceDB* db;
};
#endif
SeqSummaryCommand::SeqSummaryCommand(){
try {
globaldata = GlobalData::getInstance();
-
- if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); }
- else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); }
- else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); }
- else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); }
-
- readSeqs->read();
- db = readSeqs->getDB();
- numSeqs = db->size();
+ if(globaldata->getFastaFile() == "") { cout << "you need to at least enter a fasta file name" << endl; }
}
catch(exception& e) {
cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
//***************************************************************************************************************
-SeqSummaryCommand::~SeqSummaryCommand(){
- delete readSeqs;
-}
+SeqSummaryCommand::~SeqSummaryCommand(){ /* do nothing */ }
//***************************************************************************************************************
int SeqSummaryCommand::execute(){
try{
-
- ofstream outfile;
- string summaryFile = getRootName(globaldata->inputFileName) + "summary";
- openOutputFile(summaryFile, outfile);
- vector<int> startPosition(numSeqs, 0);
- vector<int> endPosition(numSeqs, 0);
- vector<int> seqLength(numSeqs, 0);
- vector<int> ambigBases(numSeqs, 0);
- vector<int> longHomoPolymer(numSeqs, 0);
+ ifstream inFASTA;
+ openInputFile(globaldata->getFastaFile(), inFASTA);
+ int numSeqs = 0;
+
+ ofstream outSummary;
+ string summaryFile = globaldata->getFastaFile() + ".summary";
+ openOutputFile(summaryFile, outSummary);
- if(db->get(0).getIsAligned() == 1){
- outfile << "seqname\tstart\tend\tlength\tambiguities\tlonghomopolymer" << endl;
- for(int i = 0; i < numSeqs; i++) {
- Sequence current = db->get(i);
- startPosition[i] = current.getStartPos();
- endPosition[i] = current.getEndPos();
- seqLength[i] = current.getNumBases();
- ambigBases[i] = current.getAmbigBases();
- longHomoPolymer[i] = current.getLongHomoPolymer();
- outfile << current.getName() << '\t' << startPosition[i] << '\t' << endPosition[i] << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl;
- }
- }
- else{
- outfile << "seqname\tlength\tambiguities\tlonghomopolymer" << endl;
- for(int i=0;i<numSeqs;i++){
- Sequence current = db->get(i);
- seqLength[i] = current.getNumBases();
- ambigBases[i] = current.getAmbigBases();
- longHomoPolymer[i] = current.getLongHomoPolymer();
- outfile << current.getName() << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl;
- }
+ vector<int> startPosition;
+ vector<int> endPosition;
+ vector<int> seqLength;
+ vector<int> ambigBases;
+ vector<int> longHomoPolymer;
+
+ outSummary << "seqname\tstart\tend\tnbases\tambigs\tpolymer" << endl;
+
+ while(!inFASTA.eof()){
+ Sequence current(inFASTA);
+ startPosition.push_back(current.getStartPos());
+ endPosition.push_back(current.getEndPos());
+ seqLength.push_back(current.getNumBases());
+ ambigBases.push_back(current.getAmbigBases());
+ longHomoPolymer.push_back(current.getLongHomoPolymer());
+
+ outSummary << current.getName() << '\t';
+ outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t';
+ outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t';
+ outSummary << current.getLongHomoPolymer() << endl;
+
+ numSeqs++;
+ gobble(inFASTA);
}
+ inFASTA.close();
+ sort(startPosition.begin(), startPosition.end());
+ sort(endPosition.begin(), endPosition.end());
sort(seqLength.begin(), seqLength.end());
sort(ambigBases.begin(), ambigBases.end());
sort(longHomoPolymer.begin(), longHomoPolymer.end());
- int median = int(numSeqs * 0.500);
- int lowestPtile = int(numSeqs * 0.025);
- int lowPtile = int(numSeqs * 0.250);
- int highPtile = int(numSeqs * 0.750);
- int highestPtile = int(numSeqs * 0.975);
- int max = numSeqs - 1;
+ int ptile0_25 = int(numSeqs * 0.025);
+ int ptile25 = int(numSeqs * 0.250);
+ int ptile50 = int(numSeqs * 0.500);
+ int ptile75 = int(numSeqs * 0.750);
+ int ptile97_5 = int(numSeqs * 0.975);
+ int ptile100 = numSeqs - 1;
cout << endl;
- if(db->get(0).getIsAligned() == 1){
- sort(startPosition.begin(), startPosition.end());
- sort(endPosition.begin(), endPosition.end());
-
- cout << "\t\tStart\tEnd\tLength\tN's\tPolymer" << endl;
- cout << "Minimum:\t" << startPosition[0] << '\t' << endPosition[0] << '\t' << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl;
- cout << "2.5%-tile:\t" << startPosition[lowestPtile] << '\t' << endPosition[lowestPtile] << '\t' << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl;
- cout << "25%-tile:\t" << startPosition[lowPtile] << '\t' << endPosition[lowPtile] << '\t' << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl;
- cout << "Median: \t" << startPosition[median] << '\t' << endPosition[median] << '\t' << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl;
- cout << "75%-tile:\t" << startPosition[highPtile] << '\t' << endPosition[highPtile] << '\t' << seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl;
- cout << "97.5%-tile:\t" << startPosition[highestPtile] << '\t' << endPosition[highestPtile] << '\t' << seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl;
- cout << "Maximum:\t" << startPosition[max] << '\t' << endPosition[max] << '\t' << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl;
- }
- else{
- cout << "\t\tLength\tN's\tPolymer" << endl;
- cout << "Minimum:\t" << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl;
- cout << "2.5%-tile:\t" << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl;
- cout << "25%-tile:\t" << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl;
- cout << "Median: \t" << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl;
- cout << "75%-tile:\t"<< seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl;
- cout << "97.5%-tile:\t"<< seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl;
- cout << "Maximum:\t" << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl;
- }
+ cout << "\t\tStart\tEnd\tNBases\tAmbigs\tPolymer" << endl;
+ cout << "Minimum:\t" << startPosition[0] << '\t' << endPosition[0] << '\t' << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl;
+ cout << "2.5%-tile:\t" << startPosition[ptile0_25] << '\t' << endPosition[ptile0_25] << '\t' << seqLength[ptile0_25] << '\t' << ambigBases[ptile0_25] << '\t' << longHomoPolymer[ptile0_25] << endl;
+ cout << "25%-tile:\t" << startPosition[ptile25] << '\t' << endPosition[ptile25] << '\t' << seqLength[ptile25] << '\t' << ambigBases[ptile25] << '\t' << longHomoPolymer[ptile25] << endl;
+ cout << "Median: \t" << startPosition[ptile50] << '\t' << endPosition[ptile50] << '\t' << seqLength[ptile50] << '\t' << ambigBases[ptile50] << '\t' << longHomoPolymer[ptile50] << endl;
+ cout << "75%-tile:\t" << startPosition[ptile75] << '\t' << endPosition[ptile75] << '\t' << seqLength[ptile75] << '\t' << ambigBases[ptile75] << '\t' << longHomoPolymer[ptile75] << endl;
+ cout << "97.5%-tile:\t" << startPosition[ptile97_5] << '\t' << endPosition[ptile97_5] << '\t' << seqLength[ptile97_5] << '\t' << ambigBases[ptile97_5] << '\t' << longHomoPolymer[ptile97_5] << endl;
+ cout << "Maximum:\t" << startPosition[ptile100] << '\t' << endPosition[ptile100] << '\t' << seqLength[ptile100] << '\t' << ambigBases[ptile100] << '\t' << longHomoPolymer[ptile100] << endl;
cout << "# of Seqs:\t" << numSeqs << endl;
return 0;
int execute();
private:
- int numSeqs;
GlobalData* globaldata;
- ReadSeqs* readSeqs;
- SequenceDB* db;
+
};
#endif
name = newName;
if(sequence.find_first_of('-') != string::npos) {
setAligned(sequence);
- isAligned = 1;
}
setUnaligned(sequence);
//********************************************************************************************************************
Sequence::Sequence(ifstream& fastaFile){
+ initialize();
string accession; // provided a file handle to a fasta-formatted sequence file, read in the next
fastaFile >> accession; // accession number and sequence we find...
}
}
}
-
+ isAligned = 1;
}
//********************************************************************************************************************
if(endPos == -1){
for(int j = 0; j < alignmentLength; j++) {
if(aligned[j] != '.'){
- startPos = j;
+ startPos = j + 1;
break;
}
}
- }
+ }
+ if(isAligned == 0){ startPos = 1; }
+
return startPos;
}
if(endPos == -1){
for(int j=alignmentLength-1;j>=0;j--){
if(aligned[j] != '.'){
- endPos = j;
+ endPos = j + 1;
break;
}
}
}
+ if(isAligned == 0){ endPos = numBases; }
+
return endPos;
}
}
//********************************************************************************************************************
+
+void Sequence::reverseComplement(){
+
+ string temp;
+ for(int i=numBases-1;i>=0;i--){
+ if(unaligned[i] == 'A') { temp += 'T'; }
+ else if(unaligned[i] == 'T'){ temp += 'A'; }
+ else if(unaligned[i] == 'G'){ temp += 'C'; }
+ else if(unaligned[i] == 'C'){ temp += 'G'; }
+ else { temp += 'N'; }
+ }
+ unaligned = temp;
+
+}
+
+//********************************************************************************************************************
void setPairwise(string);
void setAligned(string);
void setLength();
+ void reverseComplement();
string convert2ints();
string getName();
string summaryseqsArray[] = {"fasta","phylip","clustal","nexus"};
commandParameters["summary.seqs"] = addParameters(summaryseqsArray, sizeof(summaryseqsArray)/sizeof(string));
- string screenseqsArray[] = {"fasta","phylip","clustal","nexus", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"};
+ string screenseqsArray[] = {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"};
commandParameters["screen.seqs"] = addParameters(screenseqsArray, sizeof(screenseqsArray)/sizeof(string));
string vennArray[] = {"groups","line","label","calc"};