*p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p;
float val = (float)hmapIter->second;
memcpy(p, &val, 4);
- samwrite(out, tmp_b);
+ if (tmp_b->core.qual > 0) samwrite(out, tmp_b); // output only when MAPQ > 0
bam_destroy1(tmp_b); // now hmapIter->b makes no sense
}
hmap.clear();
*p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p;
float val = (float)hmapIter->second;
memcpy(p, &val, 4);
- samwrite(out, tmp_b);
+ if (tmp_b->core.qual > 0) samwrite(out, tmp_b); // If MAPQ is equal to 0, do not output this alignment
bam_destroy1(tmp_b); // now hmapIter->b makes no sense
}
hmap.clear();
float val = (float)hmapIter->second;
memcpy(p, &val, 4);
memcpy(p2, &val, 4);
-
- samwrite(out, tmp_b);
- samwrite(out, tmp_b2);
+
+ // If MAPQ is equal to 0, do not output this alignment pair
+ if (tmp_b->core.qual > 0) {
+ samwrite(out, tmp_b);
+ samwrite(out, tmp_b2);
+ }
bam_destroy1(tmp_b);
bam_destroy1(tmp_b2);
memcpy(p, &val, 4);
memcpy(p2, &val, 4);
- samwrite(out, tmp_b);
- samwrite(out, tmp_b2);
+ if (tmp_b->core.qual > 0) {
+ samwrite(out, tmp_b);
+ samwrite(out, tmp_b2);
+ }
bam_destroy1(tmp_b);
bam_destroy1(tmp_b2);
#include<pthread.h>
#include "utils.h"
+#include "sampling.h"
#include "Read.h"
#include "SingleRead.h"
bool genBamF; // If user wants to generate bam file, true; otherwise, false.
+bool bamSampling; // true if sampling from read posterior distribution when bam file is generated
bool updateModel, calcExpectedWeights;
bool genGibbsOut; // generate file for Gibbs sampler
delete[] mhps;
}
-int tmp_n;
-
inline bool doesUpdateModel(int ROUND) {
// return ROUND <= 20 || ROUND % 100 == 0;
return ROUND <= 10;
sprintf(chr_list, "%s.chrlist", refName);
pt_chr_list = (char*)(&chr_list);
}
+
+ if (bamSampling) {
+ int local_N;
+ int fr, to, len, id;
+ vector<double> arr;
+ arr.clear();
+
+ if (verbose) printf("Begin to sample reads from their posteriors.\n");
+ for (int i = 0; i < nThreads; i++) {
+ local_N = hitvs[i]->getN();
+ for (int j = 0; j < local_N; j++) {
+ fr = hitvs[i]->getSAt(j);
+ to = hitvs[i]->getSAt(j + 1);
+ len = to - fr + 1;
+ arr.resize(len);
+ arr[0] = ncpvs[i][j];
+ for (int k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb();
+ id = (arr[len - 1] < EPSILON ? -1 : sample(arr, len)); // if all entries in arr are 0, let id be -1
+ for (int k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0);
+ }
+ }
+ }
+ if (verbose) printf("Sampling is finished.\n");
BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, pt_chr_list);
HitWrapper<HitType> wrapper(nThreads, hitvs);
bool quiet = false;
if (argc < 5) {
- printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n");
+ printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
printf(" refName: reference name\n");
printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
printf(" sampleName: sample's name, including the path\n");
printf(" -p: number of threads which user wants to use. (default: 1)\n");
printf(" -b: produce bam format output file. (default: off)\n");
printf(" -q: set it quiet\n");
- printf(" --gibbs-out: generate output file use by Gibbs sampler. (default: off)\n");
+ printf(" --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n");
+ printf(" --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
printf("// model parameters should be in imdName.mparams.\n");
exit(-1);
}
nThreads = 1;
genBamF = false;
+ bamSampling = false;
genGibbsOut = false;
pt_fn_list = pt_chr_list = NULL;
}
if (!strcmp(argv[i], "-q")) { quiet = true; }
if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
+ if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
}
if (nThreads <= 0) { fprintf(stderr, "Number of threads should be bigger than 0!\n"); exit(-1); }
//assert(nThreads > 0);
-#include<ctime>
#include<cstdio>
#include<cstring>
#include<cstdlib>
#include<sstream>
#include<vector>
-#include "boost/random.hpp"
-
#include "utils.h"
+#include "sampling.h"
#include "Model.h"
#include "SingleModel.h"
vector<double> arr;
-boost::mt19937 rng(time(NULL));
-boost::uniform_01<boost::mt19937> rg(rng);
-
void load_data(char* reference_name, char* statName, char* imdName) {
ifstream fin;
string line;
if (verbose) { printf("Loading Data is finished!\n"); }
}
-// arr should be cumulative!
-// interval : [,)
-// random number should be in [0, arr[len - 1])
-// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1
-int sample(vector<double>& arr, int len) {
- int l, r, mid;
- double prb = rg() * arr[len - 1];
-
- l = 0; r = len - 1;
- while (l <= r) {
- mid = (l + r) / 2;
- if (arr[mid] <= prb) l = mid + 1;
- else r = mid - 1;
- }
-
- if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); }
- assert(l < len);
-
- return l;
-}
-
void init() {
int len, fr, to;
#include<cassert>
#include<string>
+#include<stdint.h>
+
#include "sam/bam.h"
#include "sam/sam.h"
cur_tid = -1;
wig_arr = NULL;
while (samread(bam_in, b) >= 0) {
+ if (b->core.flag & 0x0004) continue;
+
if (b->core.tid != cur_tid) {
if (cur_tid >= 0) generateWiggle(cur_tid);
cur_tid = b->core.tid;
memset(wig_arr, 0, len);
}
- float w = bam_aux2f(bam_aux_get(b, "ZW"));
+ uint8_t *p_tag = bam_aux_get(b, "ZW");
+ float w = (p_tag != NULL ? bam_aux2f(p_tag) : 1.0);
int pos = b->core.pos;
uint32_t *p = bam1_cigar(b);
$(CC) -O3 buildReadIndex.cpp -o rsem-build-read-index
+simul.h : boost/random.hpp
+
ReadReader.h : SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h ReadIndex.h
SingleModel.h : utils.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h
BamWriter.h : sam/sam.h sam/bam.h utils.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h
+sampling.h : boost/random.hpp
+
rsem-run-em : EM.o sam/libbam.a
$(CC) -o rsem-run-em EM.o sam/libbam.a -lz -lpthread
-EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h EM.cpp simul.h
+EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sampling.h boost/random.hpp EM.cpp
$(CC) $(COFLAGS) EM.cpp
rsem-bam2wig : sam/bam.h sam/sam.h sam/libbam.a bam2wig.cpp
$(CC) -o rsem-run-gibbs Gibbs.o -lpthread
#some header files are omitted
-Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h boost/random.hpp Gibbs.cpp
+Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h sampling.h boost/random.hpp Gibbs.cpp
$(CC) $(COFLAGS) Gibbs.cpp
rsem-calculate-credibility-intervals : calcCI.o
my $nThreads = 1;
my $genBamF = 0;
+my $sampling = 0;
my $calcCI = 0;
my $quiet = 0;
my $help = 0;
"num-rspd-bins=i" => \$B,
"p|num-threads=i" => \$nThreads,
"out-bam" => \$genBamF,
+ "sampling-for-bam" => \$sampling,
"calc-ci" => \$calcCI,
"ci-memory=i" => \$NMB,
"time" => \$mTime,
pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25);
+pod2usage(-msg => "--sampling-for-bam cannot be specified if --out-bam is not specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF);
if ($strand_specific) { $probF = 1.0; }
$command .= " -b $samInpType $inpF";
if ($fn_list ne "") { $command .= " 1 $fn_list"; }
else { $command .= " 0"; }
+ if ($sampling) { $command .= " --sampling"; }
}
if ($calcCI) { $command .= " --gibbs-out"; }
if ($quiet) { $command .= " -q"; }
Generate a BAM file, 'sample_name.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' will be generated. (Default: off)
+=item B<--sampling-for-bam>
+
+When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled and outputed according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is outputed. It cannot be specified unless --out-bam is specified. (Default: off)
+
=item B<--calc-ci>
Calculate 95% credibility intervals and posterior mean estimates. (Default: off)
--- /dev/null
+#ifndef SAMPLING
+#define SAMPLING
+
+#include<ctime>
+#include<cstdio>
+#include<cassert>
+#include<vector>
+
+#include "boost/random.hpp"
+
+boost::mt19937 rng(time(NULL));
+boost::uniform_01<boost::mt19937> rg(rng);
+
+// arr should be cumulative!
+// interval : [,)
+// random number should be in [0, arr[len - 1])
+// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1
+int sample(std::vector<double>& arr, int len) {
+ int l, r, mid;
+ double prb = rg() * arr[len - 1];
+
+ l = 0; r = len - 1;
+ while (l <= r) {
+ mid = (l + r) / 2;
+ if (arr[mid] <= prb) l = mid + 1;
+ else r = mid - 1;
+ }
+
+ if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); }
+ assert(l < len);
+
+ return l;
+}
+
+#endif