variable-length reads and RSPD estimation. It can also generate
genomic-coordinate BAM files and UCSC wiggle files for visualization. In
addition, it provides posterior mean and 95% credibility interval
-estimates for expression levels.
+estimates for expression levels.
## <a name="compilation"></a> Compilation & Installation
alternative aligner, you may also want to provide the --no-bowtie option
to rsem-prepare-reference so that the Bowtie indices are not built.
+However, please note that RSEM does **not** support gapped
+alignments. So make sure that your aligner does not produce alignments
+with insertions/deletions. Also, please make sure that you use
+'reference_name.idx.fa', which is generated by RSEM, to build your
+aligner's indices.
+
### III. Visualization
RSEM contains a version of samtools in the 'sam' subdirectory. When
Usage:
- rsem-plot-model modelF outF
+ rsem-plot-model sample_name outF
-modelF: the sample_name.model file generated by RSEM
+sample_name: the name of the sample analyzed
outF: the file name for plots generated from the model. It is a pdf file
The plots generated depends on read type and user configuration. It
may include fragment length distribution, mate length distribution,
read start position distribution (RSPD), quality score vs observed
quality given a reference base, position vs percentage of sequencing
-error given a reference base.
+error given a reference base, and a histogram of read alignments.
fragment length distribution and mate length distribution: x-axis is fragment/mate length, y axis is the probability of generating a fragment/mate with the associated length
Quality score vs. observed quality given a reference base: x-axis is Phred quality scores associated with data, y-axis is the "observed quality", Phred quality scores learned by RSEM from the data. Q = -10log_10(P), where Q is Phred quality score and P is the probability of sequencing error for a particular base
Position vs. percentage sequencing error given a reference base: x-axis is position and y-axis is percentage sequencing error
+
+Histogram of read alignments: x-axis is the number of alignments a read has and y-axis is the number of such reads
## <a name="example"></a> Example
## <a name="acknowledgements"></a> Acknowledgements
-RSEM uses randomc.h and mersenne.cpp from
-<http://lxnt.info/rng/randomc.htm> for random number generation. RSEM
-also uses the [Boost C++](http://www.boost.org) and
+RSEM uses the [Boost C++](http://www.boost.org) and
[samtools](http://samtools.sourceforge.net) libraries.
## <a name="license"></a> License
std::vector<unsigned int> fmasks; // record masks for forward strand, each position occupies 1 bit
};
-//internal read; option 0 : read all 1 : do not read seqence and name
+//internal read; option 0 : read all, 1 : do not read sequences
bool RefSeq::read(std::ifstream& fin, int option) {
std::string line;
getline(fin, line);
assert(option == 0 || option == 1);
- if (option == 1) { name = seq = ""; }
+ if (option == 1) { seq = ""; }
return true;
}
}
//inpF in fasta format, with sequence all in one line together
-//option 0 read all, 1 do not read sequences and names
+//option 0 read all, 1 do not read sequences
void Refs::loadRefs(char *inpF, int option) {
std::ifstream fin;
RefSeq seq;
#include "utils.h"
-#include "Transcript.h"
-#include "Transcripts.h"
+#include "RefSeq.h"
+#include "Refs.h"
#include "SingleRead.h"
#include "SingleReadQ.h"
class SamParser {
public:
- SamParser(char, const char*, Transcripts&, const char* = 0);
+ SamParser(char, const char*, Refs&, const char* = 0);
~SamParser();
/**
char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
// aux, if not 0, points to the file name of fn_list
-SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux) {
+SamParser::SamParser(char inpType, const char* inpF, Refs& refs, const char* aux) {
switch(inpType) {
case 'b': sam_in = samopen(inpF, "rb", aux); break;
case 's': sam_in = samopen(inpF, "r", aux); break;
if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); }
// Check if the reference used for aligner is the transcript set RSEM generated
- if (transcripts.getM() != header->n_targets) {
+ if (refs.getM() != header->n_targets) {
fprintf(stderr, "Number of transcripts does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
exit(-1);
}
for (int i = 0; i < header->n_targets; i++) {
- const Transcript& transcript = transcripts.getTranscriptAt(i + 1);
+ const RefSeq& refseq = refs.getRef(i + 1);
// If update int to long, chance the (int) conversion
- if (transcript.getTranscriptID().compare(header->target_name[i]) != 0 || transcript.getLength() != (int)header->target_len[i]) {
+ if (refseq.getName().compare(header->target_name[i]) != 0 || refseq.getTotLen() != (int)header->target_len[i]) {
fprintf(stderr, "Transcript information does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
exit(-1);
}
HitContainer.h : GroupInfo.h
-SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcript.h Transcripts.h
+SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h RefSeq.h Refs.h
rsem-parse-alignments : parseIt.o sam/libbam.a
$(CC) -o rsem-parse-alignments parseIt.o sam/libbam.a -lz
-parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h Transcript.h Transcripts.h sam/sam.h sam/bam.h parseIt.cpp
+parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h RefSeq.h Refs.h sam/sam.h sam/bam.h parseIt.cpp
$(CC) $(COFLAGS) parseIt.cpp
#include "GroupInfo.h"
-#include "Transcript.h"
-#include "Transcripts.h"
+#include "RefSeq.h"
+#include "Refs.h"
#include "SingleRead.h"
#include "SingleReadQ.h"
int nHits; // # of hits
int nUnique, nMulti, nIsoMulti;
char fn_list[STRLEN];
-char groupF[STRLEN], tiF[STRLEN];
+char refF[STRLEN], groupF[STRLEN];
char imdName[STRLEN];
char datF[STRLEN], cntF[STRLEN];
+Refs refs;
GroupInfo gi;
-Transcripts transcripts;
SamParser *parser;
ofstream hit_out;
char* aux = 0;
if (strcmp(fn_list, "")) aux = fn_list;
- parser = new SamParser(alignFType, alignF, transcripts, aux);
+ parser = new SamParser(alignFType, alignF, refs, aux);
memset(cat, 0, sizeof(cat));
memset(readOutFs, 0, sizeof(readOutFs));
if (record_val >= 0) {
record_read.write(n_os, cat[record_val]);
++N[record_val];
+ }
+ // flush out the previous read's hits if that read is an alignable read
+ if (record_val == 1) {
hits.updateRI();
nHits += hits.getNHits();
nMulti += hits.calcNumGeneMultiReads(gi);
if (record_val >= 0) {
record_read.write(n_os, cat[record_val]);
++N[record_val];
+ }
+
+ if (record_val == 1) {
hits.updateRI();
nHits += hits.getNHits();
nMulti += hits.calcNumGeneMultiReads(gi);
verbose = !quiet;
+ sprintf(refF, "%s.seq", argv[1]);
+ refs.loadRefs(refF, 1);
sprintf(groupF, "%s.grp", argv[1]);
gi.load(groupF);
- sprintf(tiF, "%s.ti", argv[1]);
- transcripts.readFrom(tiF);
sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
sprintf(datF, "%s.dat", imdName);
fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
fout<<nHits<<" "<<read_type<<endl;
+ fout<<"0\t"<<N[0]<<endl;
for (iter = counter.begin(); iter != counter.end(); iter++) {
fout<<iter->first<<'\t'<<iter->second<<endl;
}
+ fout<<"Inf\t"<<N[2]<<endl;
fout.close();
release();
}
if (!$keep_intermediate_files) {
- $status = system ("rm -rf $tmp_dir");
+ $status = system ("rm -rf $temp_dir");
if ($status != 0) {
print "Fail to delete the temporary folder!\n";
exit(-1);
strvec <- strsplit(argv[1], split = "/")[[1]]
token <- strvec[length(strvec)]
-modelF <- paste(argv[1], ".stat/", token, ".model")
-cntF <- paste(argv[1], ".stat/", token, ".cnt")
+modelF <- paste(argv[1], ".stat/", token, ".model", sep = "")
+cntF <- paste(argv[1], ".stat/", token, ".cnt", sep = "")
pdf(argv[2])
# QProfile
readLines(con, n = 1)
+ x <- c()
peA <- c() # probability of sequencing error given reference base is A
peC <- c()
peG <- c()
peT <- c()
-
+
for (i in 1 : N) {
strvec <- readLines(con, n = 6)
list <- strsplit(strvec[1:4], split = " ")
+
vecA <- as.numeric(list[[1]])
vecC <- as.numeric(list[[2]])
vecG <- as.numeric(list[[3]])
vecT <- as.numeric(list[[4]])
- if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break
- peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecA[1])))
- peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecC[2])))
- peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecG[3])))
- peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecT[4])))
+
+ if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) next
+ x <- c(x, (i - 1))
+ # Observed quality on the Phred scale: Q = -10 * log10(P), where P = 1 - vecX[k] is taken as the
+ # sequencing-error probability for reference base X. R's log() is the natural logarithm and would
+ # scale every Q by ln(10) (~2.303), so log10() is required here.
+ # A reference base with no mass (row sum < 1e-8) is recorded as NA so it is left out of the plot.
+ peA <- c(peA, ifelse(sum(vecA) < 1e-8, NA, -10 * log10(1.0 - vecA[1])))
+ peC <- c(peC, ifelse(sum(vecC) < 1e-8, NA, -10 * log10(1.0 - vecC[2])))
+ peG <- c(peG, ifelse(sum(vecG) < 1e-8, NA, -10 * log10(1.0 - vecG[3])))
+ peT <- c(peT, ifelse(sum(vecT) < 1e-8, NA, -10 * log10(1.0 - vecT[4])))
}
- x <- 0 : (length(peA) - 1)
- matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Quality Score vs. Observed Quality", xlab = "Quality Score", ylab = "Observed Quality")
+ matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Phred Quality Score vs. Observed Quality", xlab = "Quality Score", ylab = "Observed Quality")
legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
} else {
# Profile
readLines(con, n = 1)
-
+
+ x <- c()
peA <- c() # probability of sequencing error given reference base is A
peC <- c()
peG <- c()
for (i in 1: maxL) {
strvec <- readLines(con, n = 6)
list <- strsplit(strvec[1:4], split = " ")
+
vecA <- as.numeric(list[[1]])
vecC <- as.numeric(list[[2]])
vecG <- as.numeric(list[[3]])
vecT <- as.numeric(list[[4]])
- if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break
- peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecA[1]) * 100))
- peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecC[2]) * 100))
- peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecG[3]) * 100))
- peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecT[4]) * 100))
+
+ if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) next
+ x <- c(x, i)
+ peA <- c(peA, ifelse(sum(vecA) < 1e-8, NA, (1.0 - vecA[1]) * 100))
+ peC <- c(peC, ifelse(sum(vecC) < 1e-8, NA, (1.0 - vecC[2]) * 100))
+ peG <- c(peG, ifelse(sum(vecG) < 1e-8, NA, (1.0 - vecG[3]) * 100))
+ peT <- c(peT, ifelse(sum(vecT) < 1e-8, NA, (1.0 - vecT[4]) * 100))
}
- x <- 1 : length(peA)
matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Position vs. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error")
legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
}
close(con)
pair <- read.table(file = cntF, skip = 3, sep = "\t")
-plot(pair[,1], pair[,2], xlab = "Number of Alignments", ylab = "Number of Reads", main = "Among alignable reads, distribution of # of alignments")
+barplot(pair[,2], names.arg = pair[,1], xlab = "Number of Alignments", ylab = "Number of Reads", main = "Histogram of Read Alignments")
dev.off()