From 1c7a81621434a852a7f7e11d61621e50ba1b7f2a Mon Sep 17 00:00:00 2001 From: Bo Li Date: Wed, 9 Nov 2011 23:08:33 -0600 Subject: [PATCH] rsem v1.1.14, add --sampling-for-bam option, modify rsem-bam2wig to handle BAM files which are not produced by RSEM --- BamWriter.h | 19 ++++++++++++------- EM.cpp | 34 ++++++++++++++++++++++++++++++---- Gibbs.cpp | 28 +--------------------------- bam2wig.cpp | 7 ++++++- makefile | 8 ++++++-- rsem-calculate-expression | 8 ++++++++ sampling.h | 35 +++++++++++++++++++++++++++++++++++ 7 files changed, 98 insertions(+), 41 deletions(-) create mode 100644 sampling.h diff --git a/BamWriter.h b/BamWriter.h index b39400f..e29e36b 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -263,7 +263,7 @@ void BamWriter::work(HitWrapper wrapper, Transcripts& transcripts) { *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; float val = (float)hmapIter->second; memcpy(p, &val, 4); - samwrite(out, tmp_b); + if (tmp_b->core.qual > 0) samwrite(out, tmp_b); // output only when MAPQ > 0 bam_destroy1(tmp_b); // now hmapIter->b makes no sense } hmap.clear(); @@ -290,7 +290,7 @@ void BamWriter::work(HitWrapper wrapper, Transcripts& transcripts) { *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; float val = (float)hmapIter->second; memcpy(p, &val, 4); - samwrite(out, tmp_b); + if (tmp_b->core.qual > 0) samwrite(out, tmp_b); // If MAPQ is equal to 0, do not output this alignment bam_destroy1(tmp_b); // now hmapIter->b makes no sense } hmap.clear(); @@ -429,9 +429,12 @@ void BamWriter::work(HitWrapper wrapper, Transcripts& transcripts) float val = (float)hmapIter->second; memcpy(p, &val, 4); memcpy(p2, &val, 4); - - samwrite(out, tmp_b); - samwrite(out, tmp_b2); + + // If MAPQ is equal to 0, do not output this alignment pair + if (tmp_b->core.qual > 0) { + samwrite(out, tmp_b); + samwrite(out, tmp_b2); + } bam_destroy1(tmp_b); bam_destroy1(tmp_b2); @@ -467,8 +470,10 @@ void BamWriter::work(HitWrapper wrapper, Transcripts& transcripts) memcpy(p, &val, 4); memcpy(p2, &val, 4); - samwrite(out, tmp_b); - samwrite(out, tmp_b2); + if (tmp_b->core.qual > 0) { + samwrite(out, tmp_b); + samwrite(out, tmp_b2); + } bam_destroy1(tmp_b); bam_destroy1(tmp_b2); diff --git a/EM.cpp b/EM.cpp index 322a306..107be3c 100644 --- a/EM.cpp +++ b/EM.cpp @@ -10,6 +10,7 @@ #include #include "utils.h" +#include "sampling.h" #include "Read.h" #include "SingleRead.h" @@ -58,6 +59,7 @@ int nThreads; bool genBamF; // If user wants to generate bam file, true; otherwise, false. +bool bamSampling; // true if sampling from read posterior distribution when bam file is generated bool updateModel, calcExpectedWeights; bool genGibbsOut; // generate file for Gibbs sampler @@ -394,8 +396,6 @@ void release(ReadReader **readers, HitContainer **hitvs, doub delete[] mhps; } -int tmp_n; - inline bool doesUpdateModel(int ROUND) { // return ROUND <= 20 || ROUND % 100 == 0; return ROUND <= 10; @@ -619,6 +619,29 @@ void EM() { sprintf(chr_list, "%s.chrlist", refName); pt_chr_list = (char*)(&chr_list); } + + if (bamSampling) { + int local_N; + int fr, to, len, id; + vector arr; + arr.clear(); + + if (verbose) printf("Begin to sample reads from their posteriors.\n"); + for (int i = 0; i < nThreads; i++) { + local_N = hitvs[i]->getN(); + for (int j = 0; j < local_N; j++) { + fr = hitvs[i]->getSAt(j); + to = hitvs[i]->getSAt(j + 1); + len = to - fr + 1; + arr.resize(len); + arr[0] = ncpvs[i][j]; + for (int k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb(); + id = (arr[len - 1] < EPSILON ? -1 : sample(arr, len)); // if all entries in arr are 0, let id be -1 + for (int k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0); + } + } + } + if (verbose) printf("Sampling is finished.\n"); BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, pt_chr_list); HitWrapper wrapper(nThreads, hitvs); @@ -633,7 +656,7 @@ int main(int argc, char* argv[]) { bool quiet = false; if (argc < 5) { - printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n"); + printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n"); printf(" refName: reference name\n"); printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n"); printf(" sampleName: sample's name, including the path\n"); @@ -641,7 +664,8 @@ int main(int argc, char* argv[]) { printf(" -p: number of threads which user wants to use. (default: 1)\n"); printf(" -b: produce bam format output file. (default: off)\n"); printf(" -q: set it quiet\n"); - printf(" --gibbs-out: generate output file use by Gibbs sampler. (default: off)\n"); + printf(" --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n"); + printf(" --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n"); printf("// model parameters should be in imdName.mparams.\n"); exit(-1); } @@ -657,6 +681,7 @@ int main(int argc, char* argv[]) { nThreads = 1; genBamF = false; + bamSampling = false; genGibbsOut = false; pt_fn_list = pt_chr_list = NULL; @@ -673,6 +698,7 @@ int main(int argc, char* argv[]) { } if (!strcmp(argv[i], "-q")) { quiet = true; } if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; } + if (!strcmp(argv[i], "--sampling")) { bamSampling = true; } } if (nThreads <= 0) { fprintf(stderr, "Number of threads should be bigger than 0!\n"); exit(-1); } //assert(nThreads > 0); diff --git a/Gibbs.cpp b/Gibbs.cpp index 5bdfb24..979860b 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -7,9 +6,8 @@ #include #include -#include "boost/random.hpp" - #include "utils.h" +#include "sampling.h" #include "Model.h" #include "SingleModel.h" @@ -53,9 +51,6 @@ bool quiet; vector arr; -boost::mt19937 rng(time(NULL)); -boost::uniform_01 rg(rng); - void load_data(char* reference_name, char* statName, char* imdName) { ifstream fin; string line; @@ -121,27 +116,6 @@ void load_data(char* reference_name, char* statName, char* imdName) { if (verbose) { printf("Loading Data is finished!\n"); } } -// arr should be cumulative! -// interval : [,) -// random number should be in [0, arr[len - 1]) -// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1 -int sample(vector& arr, int len) { - int l, r, mid; - double prb = rg() * arr[len - 1]; - - l = 0; r = len - 1; - while (l <= r) { - mid = (l + r) / 2; - if (arr[mid] <= prb) l = mid + 1; - else r = mid - 1; - } - - if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); } - assert(l < len); - - return l; -} - void init() { int len, fr, to; diff --git a/bam2wig.cpp b/bam2wig.cpp index fcb86b3..e46fa67 100644 --- a/bam2wig.cpp +++ b/bam2wig.cpp @@ -4,6 +4,8 @@ #include #include +#include + #include "sam/bam.h" #include "sam/sam.h" @@ -61,6 +63,8 @@ int main(int argc, char* argv[]) { cur_tid = -1; wig_arr = NULL; while (samread(bam_in, b) >= 0) { + if (b->core.flag & 0x0004) continue; + if (b->core.tid != cur_tid) { if (cur_tid >= 0) generateWiggle(cur_tid); cur_tid = b->core.tid; @@ -69,7 +73,8 @@ int main(int argc, char* argv[]) { memset(wig_arr, 0, len); } - float w = bam_aux2f(bam_aux_get(b, "ZW")); + uint8_t *p_tag = bam_aux_get(b, "ZW"); + float w = (p_tag != NULL ? bam_aux2f(p_tag) : 1.0); int pos = b->core.pos; uint32_t *p = bam1_cigar(b); diff --git a/makefile b/makefile index 97cba95..a28caa7 100644 --- a/makefile +++ b/makefile @@ -62,6 +62,8 @@ rsem-build-read-index : utils.h buildReadIndex.cpp $(CC) -O3 buildReadIndex.cpp -o rsem-build-read-index +simul.h : boost/random.hpp + ReadReader.h : SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h ReadIndex.h SingleModel.h : utils.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h @@ -76,10 +78,12 @@ HitWrapper.h : HitContainer.h BamWriter.h : sam/sam.h sam/bam.h utils.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h +sampling.h : boost/random.hpp + rsem-run-em : EM.o sam/libbam.a $(CC) -o rsem-run-em EM.o sam/libbam.a -lz -lpthread -EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h EM.cpp simul.h +EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sampling.h boost/random.hpp EM.cpp $(CC) $(COFLAGS) EM.cpp rsem-bam2wig : sam/bam.h sam/sam.h sam/libbam.a bam2wig.cpp @@ -95,7 +99,7 @@ rsem-run-gibbs : Gibbs.o $(CC) -o rsem-run-gibbs Gibbs.o -lpthread #some header files are omitted -Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h boost/random.hpp Gibbs.cpp +Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h sampling.h boost/random.hpp Gibbs.cpp $(CC) $(COFLAGS) Gibbs.cpp rsem-calculate-credibility-intervals : calcCI.o diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 249ca8b..14679ff 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -45,6 +45,7 @@ my $B = 20; my $nThreads = 1; my $genBamF = 0; +my $sampling = 0; my $calcCI = 0; my $quiet = 0; my $help = 0; @@ -83,6 +84,7 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "num-rspd-bins=i" => \$B, "p|num-threads=i" => \$nThreads, "out-bam" => \$genBamF, + "sampling-for-bam" => \$sampling, "calc-ci" => \$calcCI, "ci-memory=i" => \$NMB, "time" => \$mTime, @@ -111,6 +113,7 @@ pod2usage(-msg => "Min fragment length should be smaller or equal to max fragmen pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1); pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1); pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25); +pod2usage(-msg => "--sampling-for-bam cannot be specified if --out-bam is not specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF); if ($strand_specific) { $probF = 1.0; } @@ -263,6 +266,7 @@ if ($genBamF) { $command .= " -b $samInpType $inpF"; if ($fn_list ne "") { $command .= " 1 $fn_list"; } else { $command .= " 0"; } + if ($sampling) { $command .= " --sampling"; } } if ($calcCI) { $command .= " --gibbs-out"; } if ($quiet) { $command .= " -q"; } @@ -479,6 +483,10 @@ Number of threads to use. Both Bowtie and expression estimation will use this ma Generate a BAM file, 'sample_name.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' will be generated. (Default: off) +=item B<--sampling-for-bam> + +When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled and outputed according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is outputed. It cannot be specified unless --out-bam is specified. (Default: off) + =item B<--calc-ci> Calculate 95% credibility intervals and posterior mean estimates. (Default: off) diff --git a/sampling.h b/sampling.h new file mode 100644 index 0000000..4171403 --- /dev/null +++ b/sampling.h @@ -0,0 +1,35 @@ +#ifndef SAMPLING +#define SAMPLING + +#include +#include +#include +#include + +#include "boost/random.hpp" + +boost::mt19937 rng(time(NULL)); +boost::uniform_01 rg(rng); + +// arr should be cumulative! +// interval : [,) +// random number should be in [0, arr[len - 1]) +// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1 +int sample(std::vector& arr, int len) { + int l, r, mid; + double prb = rg() * arr[len - 1]; + + l = 0; r = len - 1; + while (l <= r) { + mid = (l + r) / 2; + if (arr[mid] <= prb) l = mid + 1; + else r = mid - 1; + } + + if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); } + assert(l < len); + + return l; +} + +#endif -- 2.39.2