- Modified 'rsem-tbam2gbam' to convert users' alignments from transcript BAM files into genome BAM files, provided users use 'reference_name.idx.fa' to build indices for their aligners
- Updated EBSeq from v1.1.3 to v1.1.4
- Corrected several typos in warning messages
.project
.cproject
update_doc.sh
+EBSeq/EBSeq
+EBSeq/blockmodeling
transcripts.buildMappings(in->header->n_targets, in->header->target_name);
bam_header_t *out_header = sam_header_read2(chr_list);
+
refmap.clear();
for (int i = 0; i < out_header->n_targets; i++) {
refmap[out_header->target_name[i]] = i;
}
- append_header_text(out_header, in->header->text, in->header->l_text);
+ if (in->header->l_text > 0) {
+ char comment[] = "@CO\tThis BAM file is processed by rsem-tbam2gam to convert from transcript coordinates into genomic coordinates.\n";
+ int comment_len = strlen(comment);
+
+ //Filter @SQ fields if the BAM file is user provided
+ char *text = in->header->text;
+ int l_text = in->header->l_text;
+ char *new_text = new char[l_text + comment_len];
+ int pos = 0, s = 0;
+ while (pos < l_text) {
+ if ((pos + 2 < l_text) && (text[pos] == '@') && (text[pos + 1] == 'S') && (text[pos + 2] == 'Q')) {
+ pos += 3;
+ while (pos < l_text && text[pos] != '\n') ++pos;
+ }
+ else new_text[s++] = text[pos];
+ ++pos;
+ }
+ strncpy(new_text + s, comment, comment_len);
+ s += comment_len;
+
+ append_header_text(out_header, new_text, s);
+ delete[] new_text;
+ }
out = samopen(outF, "wb", out_header);
assert(out != 0);
if (!notgood) {
- if (isPaired) assert((b->core.tid == b2->core.tid) && (b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
-
- const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
-
- convert(b, transcript);
- if (isPaired) {
- convert(b2, transcript);
- b->core.mpos = b2->core.pos;
- b2->core.mpos = b->core.pos;
- }
-
- if (cqname != bam1_qname(b)) {
- writeCollapsedLines();
- cqname = bam1_qname(b);
- collapseMap.init(isPaired);
- }
- collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
+ // for collapsing
+ if (isPaired) {
+ assert(b->core.tid == b2->core.tid);
+ if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
+ bam1_t *tmp = b; b = b2; b2 = tmp;
+ }
+ }
+
+ const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
+
+ convert(b, transcript);
+ if (isPaired) {
+ convert(b2, transcript);
+ b->core.mpos = b2->core.pos;
+ b2->core.mpos = b->core.pos;
+ }
+
+ if (cqname != bam1_qname(b)) {
+ writeCollapsedLines();
+ cqname = bam1_qname(b);
+ collapseMap.init(isPaired);
+ }
+
+ uint8_t *p = bam_aux_get(b, "ZW");
+ float prb = (p != NULL? bam_aux2f(p) : 1.0);
+ collapseMap.insert(b, b2, prb);
}
else {
- assert(cqname != bam1_qname(b));
+ assert(cqname != bam1_qname(b));
- writeCollapsedLines();
- cqname = bam1_qname(b);
- collapseMap.init(isPaired);
+ writeCollapsedLines();
+ cqname = bam1_qname(b);
+ collapseMap.init(isPaired);
- samwrite(out, b);
- if (isPaired) samwrite(out, b2);
+ samwrite(out, b);
+ if (isPaired) samwrite(out, b2);
}
-
}
writeCollapsedLines();
bam1_t *tmp_b = NULL,*tmp_b2 = NULL;
float prb;
bool isPaired;
+ uint8_t *p;
if (!collapseMap.empty(isPaired)) {
while (collapseMap.next(tmp_b, tmp_b2, prb)) {
- memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
- tmp_b->core.qual = getMAPQ(prb);
+ p = bam_aux_get(tmp_b, "ZW");
+ if (p != NULL) {
+ memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b->core.qual = getMAPQ(prb);
+ }
+ else tmp_b->core.qual = getMAPQ(1.0);
samwrite(out, tmp_b);
if (isPaired) {
- memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
- tmp_b2->core.qual = tmp_b->core.qual;
- samwrite(out, tmp_b2);
+ if (p != NULL) memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b2->core.qual = tmp_b->core.qual;
+ samwrite(out, tmp_b2);
}
bam_destroy1(tmp_b);
if (isPaired) bam_destroy1(tmp_b2);
//generate output's header
bam_header_t *out_header = bam_header_dwt(in->header);
- for (int i = 0; i < out_header->n_targets; i++) {
- out_header->target_len[i] = transcripts.getTranscriptViaEid(i + 1).getLength(); // transcript length without poly(A) tail
- }
std::ostringstream strout;
strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n";
while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
cnt += 2;
- if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; }
+ if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
assert(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)));
convert(b, hit->getConPrb());
convert(b2, hit->getConPrb());
- }
- /*
- else {
- // if only one mate can be aligned, mask it as unaligned and put an additional tag Z0:A:!
- char exclamation = '!';
- if (!(b->core.flag & 0x0004)) {
- b->core.flag |= 0x0004;
- bam_aux_append(b, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
- }
- if (!(b2->core.flag & 0x0004)) {
- b2->core.flag |= 0x0004;
- bam_aux_append(b2, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
- }
+ // omit "b->core.mpos = b2->core.pos; b2->core.mpos = b->core.pos" for the reason that it is possible that only one mate is alignable
}
- */
samwrite(out, b);
samwrite(out, b2);
frac[i] = theta[i];
denom += frac[i];
}
- general_assert(denom > 0, "No alignable reads?!");
+ general_assert(denom >= EPSILON, "No alignable reads?!");
for (int i = 1; i <= M; i++) frac[i] /= denom;
//calculate FPKM
}
if (gene_tpm[i] < EPSILON) {
- double frac = 1.0 / (e - b);
- for (int j = b; j < e; j++) {
- glens[i] += tlens[j] * frac;
- gene_eels[i] += eel[j] * frac;
- }
+ double frac = 1.0 / (e - b);
+ for (int j = b; j < e; j++) {
+ glens[i] += tlens[j] * frac;
+ gene_eels[i] += eel[j] * frac;
+ }
}
else {
- for (int j = b; j < e; j++) {
- isopct[j] = tpm[j] / gene_tpm[i];
- glens[i] += tlens[j] * isopct[j];
- gene_eels[i] += eel[j] * isopct[j];
- }
+ for (int j = b; j < e; j++) {
+ isopct[j] = tpm[j] / gene_tpm[i];
+ glens[i] += tlens[j] * isopct[j];
+ gene_eels[i] += eel[j] * isopct[j];
+ }
}
}
frac[i] = theta[i];
denom += frac[i];
}
- general_assert(denom > 0, "No alignable reads?!");
+ general_assert(denom >= EPSILON, "No alignable reads?!");
for (int i = 1; i <= M; i++) frac[i] /= denom;
//calculate FPKM
generated by the samtools included. All these files are in genomic
coordinates.
-#### a) Generating a Wiggle file
+#### a) Converting transcript BAM file into genome BAM file
+
+Normally, RSEM will do this for you via '--output-genome-bam' option
+of 'rsem-calculate-expression'. However, if you have run
+'rsem-prepare-reference' and use 'reference_name.idx.fa' to build
+indices for your aligner, you can use 'rsem-tbam2gbam' to convert your
+transcript coordinate BAM alignments file into a genomic coordinate
+BAM alignments file without the need to run the whole RSEM
+pipeline. Please note that 'rsem-prepare-reference' will convert all
+'N' into 'G' by default for 'reference_name.idx.fa'. If you do not
+want this to happen, please use '--no-ntog' option.
+
+Usage:
+
+ rsem-tbam2gbam reference_name unsorted_transcript_bam_input genome_bam_output
+
+reference_name : The name of reference built by 'rsem-prepare-reference'
+unsorted_transcript_bam_input : This file should satisfy: 1) the alignments of a same read are grouped together, 2) for any paired-end alignment, the two mates should be adjacent to each other, 3) this file should not be sorted by samtools
+genome_bam_output : The output genomic coordinate BAM file's name
+
+#### b) Generating a Wiggle file
A wiggle plot representing the expected number of reads overlapping
each position in the genome/transcript set can be generated from the
sorted_bam_input : Input BAM format file, must be sorted
wig_output : Output wiggle file's name, e.g. output.wig
wiggle_name : the name of this wiggle plot
---no-fractional-weight : If this is set, RSEM will not look for "ZW" tag and each alignment appeared in the BAM file has weight 1. Set this if your BAM file is not generated by RSEM. Please note that this option must be at the end of the command line.
+--no-fractional-weight : If this is set, RSEM will not look for "ZW" tag and each alignment appeared in the BAM file has weight 1. Set this if your BAM file is not generated by RSEM. Please note that this option must be at the end of the command line
-#### b) Loading a BAM and/or Wiggle file into the UCSC Genome Browser or Integrative Genomics Viewer(IGV)
+#### c) Loading a BAM and/or Wiggle file into the UCSC Genome Browser or Integrative Genomics Viewer(IGV)
For UCSC genome browser, please refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html).
1) Import the transcript sequences as a genome
-Select File -> Import Genome, then fill in ID, Name and Fasta file. Fasta file should be 'reference_name.idx.fa'. After that, click Save button. Suppose ID is filled as 'reference_name', a file called 'reference_name.genome' will be generated. Next time, we can use: File -> Load Genome, then select 'reference_name.genome'.
+Select File -> Import Genome, then fill in ID, Name and Fasta file. Fasta file should be 'reference_name.transcripts.fa'. After that, click Save button. Suppose ID is filled as 'reference_name', a file called 'reference_name.genome' will be generated. Next time, we can use: File -> Load Genome, then select 'reference_name.genome'.
2) Load visualization files
igvtools tile reference_name.transcript.wig reference_name.transcript.tdf reference_name.genome
-#### c) Generating Transcript Wiggle Plots
+#### d) Generating Transcript Wiggle Plots
To generate transcript wiggle plots, you should run the
'rsem-plot-transcript-wiggles' program. Run
to get usage information or visit the [rsem-plot-transcript-wiggles
documentation page](http://deweylab.biostat.wisc.edu/rsem/rsem-plot-transcript-wiggles.html).
-#### d) Visualize the model learned by RSEM
+#### e) Visualize the model learned by RSEM
RSEM provides an R script, 'rsem-plot-model', for visulazing the model learned.
+RSEM v1.2.1
+
+- Added poly(A) tails to 'reference_name.transcripts.fa' so that the RSEM generated transcript unsorted BAM file can be fed into RSEM as an input file. However, users need to rebuild their references if they want to visualize the transcript level wiggle files and BAM files using IGV
+- Modified 'rsem-tbam2gbam' to convert users' alignments from transcript BAM files into genome BAM files, provided users use 'reference_name.idx.fa' to build indices for their aligners
+- Updated EBSeq from v1.1.3 to v1.1.4
+- Corrected several typos in warning messages
+
+--------------------------------------------------------------------------------------------
+
RSEM v1.2.0
- Changed output formats, added FPKM field etc.
for (int i = 0; i < nSpC; i++) {
double sum = 0.0;
for (int j = 0; j <= M; j++) {
- theta[j] = ((j == 0 || eel[j] >= EPSILON && mw[j] >= EPSILON) ? (*rgs[j])() / mw[j] : 0.0);
+ theta[j] = ((j == 0 || (eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
sum += theta[j];
}
assert(sum >= EPSILON);
map<string, string>::iterator mi_iter; //mapping info table's iterator
void loadMappingInfo(char* mappingF) {
- ifstream fin(mappingF);
- string line, key, value;
-
- if (!fin.is_open()) {
- fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF);
- exit(-1);
- }
-
- mi_table.clear();
- while (getline(fin, line)) {
- line = cleanStr(line);
- if (line[0] == '#') continue;
- istringstream strin(line);
- strin>>value>>key;
- mi_table[key] = value;
- }
-
- fin.close();
+ ifstream fin(mappingF);
+ string line, key, value;
+
+ if (!fin.is_open()) {
+ fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF);
+ exit(-1);
+ }
+
+ mi_table.clear();
+ while (getline(fin, line)) {
+ line = cleanStr(line);
+ if (line[0] == '#') continue;
+ istringstream strin(line);
+ strin>>value>>key;
+ mi_table[key] = value;
+ }
+
+ fin.close();
}
bool buildTranscript(int sp, int ep) {
if (seqs[i] == "") {
const Transcript& transcript = transcripts.getTranscriptAt(i);
fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s, whose information might not be provided! Please check if the chromosome directory is set correctly or the list of chromosome files is complete.\n", \
- transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str());
+ transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str());
exit(-1);
}
}
Refs refs;
ofstream fout;
-char refF[STRLEN], alignerFastaF[STRLEN];
+char refF[STRLEN], alignerFastaF[STRLEN], transF[STRLEN];
int polyAChoice, polyALen;
char exceptionF[STRLEN];
// always generate references for aligners, default convert all N into G
int main(int argc, char* argv[]) {
- if (argc < 4) {
- printf("USAGE : rsem-preref refFastaF polyAChoice refName [-l polyALen] [-f exceptionF] [--no-ntog] [-q]\n\n");
- printf(" refFastaF: a FASTA format file contains all reference transcripts\n");
- printf(" polyAChoice: choice for polyA tail padding.It is a number from {0,1,2}\n");
- printf(" 0: pad polyA tail\n");
- printf(" 1: do not pad polyA tail at all\n");
- printf(" 2: pad polyA tail for all references but those in exceptionF\n");
- printf(" -l: polyALen: specify the length of polyA tail you want to pad. Default is 100\n");
- printf(" -f: exceptionF: file contains a list of exception reference ids. IDs starts from 1. Must set if polyAChoice = 2\n");
- printf(" --no-ntog: do not convert N in references into G\n");
- printf(" -q: quiet\n");
- exit(-1);
- }
-
-
- polyAChoice = atoi(argv[2]);
-
- polyALen = 125;
- ntog = true;
- quiet = false;
- memset(exceptionF, 0, sizeof(exceptionF));
-
- for (int i = 4; i < argc; i++) {
- if (!strcmp(argv[i], "-l")) { polyALen = atoi(argv[i + 1]); }
- if (!strcmp(argv[i], "-f")) { strcpy(exceptionF, argv[i + 1]); }
- if (!strcmp(argv[i], "--no-ntog")) { ntog = false; }
- if (!strcmp(argv[i], "-q")) { quiet = true; }
- }
-
- verbose = !quiet;
-
- //make references
- rules = PolyARules(polyAChoice, polyALen, exceptionF);
- refs.makeRefs(argv[1], refp, rules);
- M = refs.getM();
-
- //save references
- sprintf(refF, "%s.seq", argv[3]);
- refs.saveRefs(refF);
-
- sprintf(alignerFastaF, "%s.idx.fa", argv[3]);
- fout.open(alignerFastaF);
- for (int i = 1; i <= M; i++) {
- fout<<">"<<refs.getRef(i).getName()<<endl<<(ntog ? aligner_refp.convert(refs.getRef(i).getSeq()) : refs.getRef(i).getSeq())<<endl;
- }
- fout.close();
-
- if (verbose) { printf("IDX.fa generated!\n"); }
-
- return 0;
+ if (argc < 4) {
+ printf("USAGE : rsem-preref refFastaF polyAChoice refName [-l polyALen] [-f exceptionF] [--no-ntog] [-q]\n\n");
+ printf(" refFastaF: a FASTA format file contains all reference transcripts\n");
+ printf(" polyAChoice: choice for polyA tail padding.It is a number from {0,1,2}\n");
+ printf(" 0: pad polyA tail\n");
+ printf(" 1: do not pad polyA tail at all\n");
+ printf(" 2: pad polyA tail for all references but those in exceptionF\n");
+ printf(" -l: polyALen: specify the length of polyA tail you want to pad. Default is 100\n");
+ printf(" -f: exceptionF: file contains a list of exception reference ids. IDs starts from 1. Must set if polyAChoice = 2\n");
+ printf(" --no-ntog: do not convert N in references into G\n");
+ printf(" -q: quiet\n");
+ printf(" This program will generate a file named \"refName.transcripts.fa\", which may rewrite an existing file (e.g. refFastaF).\n");
+ exit(-1);
+ }
+
+ polyAChoice = atoi(argv[2]);
+
+ polyALen = 125;
+ ntog = true;
+ quiet = false;
+ memset(exceptionF, 0, sizeof(exceptionF));
+
+ for (int i = 4; i < argc; i++) {
+ if (!strcmp(argv[i], "-l")) { polyALen = atoi(argv[i + 1]); }
+ if (!strcmp(argv[i], "-f")) { strcpy(exceptionF, argv[i + 1]); }
+ if (!strcmp(argv[i], "--no-ntog")) { ntog = false; }
+ if (!strcmp(argv[i], "-q")) { quiet = true; }
+ }
+
+ verbose = !quiet;
+
+ //make references
+ rules = PolyARules(polyAChoice, polyALen, exceptionF);
+ refs.makeRefs(argv[1], refp, rules);
+ M = refs.getM();
+
+ //save references
+ sprintf(refF, "%s.seq", argv[3]);
+ refs.saveRefs(refF);
+
+ sprintf(transF, "%s.transcripts.fa", argv[3]);
+ fout.open(transF);
+ for (int i = 1; i <= M; i++) {
+ fout<< ">"<< refs.getRef(i).getName()<< endl<< refs.getRef(i).getSeq()<< endl;
+ }
+ fout.close();
+ if (verbose) printf("%s is generated!\n", transF);
+
+ sprintf(alignerFastaF, "%s.idx.fa", argv[3]);
+ fout.open(alignerFastaF);
+ for (int i = 1; i <= M; i++) {
+ fout<<">"<<refs.getRef(i).getName()<<endl<<(ntog ? aligner_refp.convert(refs.getRef(i).getSeq()) : refs.getRef(i).getSeq())<<endl;
+ }
+ fout.close();
+ if (verbose) printf("%s is generated!\n", alignerFastaF);
+
+ return 0;
}
=item B<--no-polyA>
-Do not add poly(A) tails to the end of reference isoforms. (Default: add poly(A) tails to all transcripts)
+Do not add poly(A) tails to the end of reference isoforms. (Default: adding poly(A) tails to all transcripts)
=item B<--no-polyA-subset> <file>
'reference_name.grp', 'reference_name.ti', 'reference_name.seq', 'reference_name.idx.fa', and 'reference_name.chrlist' are used by RSEM internally.
-B<'reference_name.transcripts.fa'> contains the extracted reference transcripts in FASTA format. Poly(A) tails are not added.
+B<'reference_name.transcripts.fa'> contains the extracted reference transcripts in FASTA format. Poly(A) tails are added unless '--no-polyA' is set.
=head1 EXAMPLES