in = samopen(inpF, "rb", NULL);
assert(in != 0);
+ transcripts.buildMappings(in->header->n_targets, in->header->target_name);
+
bam_header_t *out_header = sam_header_read2(chr_list);
refmap.clear();
for (int i = 0; i < out_header->n_targets; i++) {
// at least one segment is not properly mapped
if ((b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004))) continue;
- const Transcript& transcript = transcripts.getTranscriptAt(b->core.tid + 1);
+ const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
convert(b, transcript);
if (isPaired) {
}
assert(in != 0);
+ //build mappings from external sid to internal sid
+ transcripts.buildMappings(in->header->n_targets, in->header->target_name);
+
//generate output's header
bam_header_t *out_header = bam_header_dwt(in->header);
-
- if (out_header->n_targets != transcripts.getM()) {
- fprintf(stderr, "Number of reference sequences recorded in the header is not correct! The header contains %d sequences while there should be %d sequences\n", out_header->n_targets, transcripts.getM());
- exit(-1);
- }
-
for (int i = 0; i < out_header->n_targets; i++) {
- const Transcript& transcript = transcripts.getTranscriptAt(i + 1);
- if (out_header->target_name[i] != transcript.getTranscriptID()) {
- fprintf(stderr, "Reference sequence %d's name recorded in the header is not correct! \n", i);
- fprintf(stderr, "Name in the header: %s\n", out_header->target_name[i]);
- fprintf(stderr, "Should be: %s\n", transcript.getTranscriptID().c_str());
- exit(-1);
- }
- out_header->target_len[i] = transcript.getLength(); // transcript length without poly(A) tail
+ out_header->target_len[i] = transcripts.getTranscriptViaEid(i + 1).getLength(); // transcript length without poly(A) tail
}
std::ostringstream strout;
hit = wrapper.getNextHit();
assert(hit != NULL);
- assert(b->core.tid + 1 == hit->getSid());
+ assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
convert(b, hit->getConPrb());
if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
}
hit = wrapper.getNextHit();
assert(hit != NULL);
- assert(b->core.tid + 1 == hit->getSid());
- assert(b2->core.tid + 1 == hit->getSid());
+ assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+ assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
convert(b, hit->getConPrb());
convert(b2, hit->getConPrb());
}
void BamWriter::convert(bam1_t *b, double prb) {
- int sid = b->core.tid + 1;
- const Transcript& transcript = transcripts.getTranscriptAt(sid);
+ const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
int pos = b->core.pos;
int readlen = b->core.l_qseq;
std::streampos gap2 = std::streampos(nSamples - to) * FLOATSIZE;
float *p = NULL;
- ftmpOut.seekp(0, std::ios_base::beg);
+ ftmpOut.seekp(0, std::ios::beg);
for (int i = 0; i < cvlen; i++) {
p = buffer + i;
- ftmpOut.seekp(gap1, std::ios_base::cur);
+ ftmpOut.seekp(gap1, std::ios::cur);
for (int j = fr; j < to; j++) {
ftmpOut.write((char*)p, FLOATSIZE);
p += cvlen;
}
- ftmpOut.seekp(gap2, std::ios_base::cur);
+ ftmpOut.seekp(gap2, std::ios::cur);
}
cpos = 0;
time_t b = time(NULL);
- printTimeUsed(a, b);
+ printTimeUsed(a, b, "EM.cpp");
return 0;
}
alternative aligner, you may also want to provide the '--no-bowtie' option
to 'rsem-prepare-reference' so that the Bowtie indices are not built.
-Some aligners' (other than Bowtie) output might need to be converted
-so that RSEM can use. For conversion, please run
+RSEM requires all alignments of the same read group together. For
+paired-end reads, RSEM also requires the two mates of any alignment be
+adjacent. If the alternative aligner does not satisfy the first
+requirement, you can use 'convert-sam-for-rsem' for conversion. Please run
convert-sam-for-rsem --help
#include "sam/sam.h"
#include "utils.h"
-
-#include "RefSeq.h"
-#include "Refs.h"
+#include "my_assert.h"
#include "SingleRead.h"
#include "SingleReadQ.h"
#include "SingleHit.h"
#include "PairedEndHit.h"
+#include "Transcripts.h"
+
class SamParser {
public:
- SamParser(char, const char*, Refs&, const char* = 0);
+ SamParser(char, const char*, Transcripts&, const char* = 0);
~SamParser();
/**
bam_header_t *header;
bam1_t *b, *b2;
+ Transcripts& transcripts;
//tag used by aligner
static char rtTag[STRLEN];
char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
// aux, if not 0, points to the file name of fn_list
-SamParser::SamParser(char inpType, const char* inpF, Refs& refs, const char* aux) {
+SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux)
+ : transcripts(transcripts)
+{
switch(inpType) {
case 'b': sam_in = samopen(inpF, "rb", aux); break;
case 's': sam_in = samopen(inpF, "r", aux); break;
default: assert(false);
}
- if (sam_in == 0) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
- header = sam_in->header;
- if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); }
-
- // Check if the reference used for aligner is the transcript set RSEM generated
- if (refs.getM() != header->n_targets) {
- fprintf(stderr, "Number of transcripts does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
- exit(-1);
- }
- for (int i = 0; i < header->n_targets; i++) {
- const RefSeq& refseq = refs.getRef(i + 1);
- // If update int to long, chance the (int) conversion
- if (refseq.getName().compare(header->target_name[i]) != 0 || refseq.getTotLen() != (int)header->target_len[i]) {
- fprintf(stderr, "Transcript information does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
- exit(-1);
- }
- }
-
- b = bam_init1();
- b2 = bam_init1();
+ general_assert(sam_in != 0, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
+ header = sam_in->header;
+ general_assert(header != 0, "Fail to parse sam header!");
+
+ transcripts.buildMappings(header->n_targets, header->target_name);
+
+ b = bam_init1();
+ b2 = bam_init1();
}
SamParser::~SamParser() {
if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
if (getDir(b) > 0) {
- hit = SingleHit(b->core.tid + 1, b->core.pos);
+ hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos);
}
else {
- hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
+ hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
}
}
if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
if (getDir(b) > 0) {
- hit = SingleHit(b->core.tid + 1, b->core.pos);
+ hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos);
}
else {
- hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
+ hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
}
}
}
//assert(mp1->core.tid == mp2->core.tid);
if (getDir(mp1) > 0) {
- hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
+ hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
}
else {
- hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
+ hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
}
}
}
//assert(mp1->core.tid == mp2->core.tid);
if (getDir(mp1) > 0) {
- hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
+ hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
}
else {
- hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
+ hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
}
}
#include<fstream>
#include<vector>
#include<algorithm>
+#include<map>
+#include<string>
+#include "my_assert.h"
#include "Transcript.h"
M = 0; this->type = type;
transcripts.clear();
transcripts.push_back(Transcript());
+
+ e2i.clear(); i2e.clear();
}
int getM() { return M; }
void readFrom(const char*);
void writeTo(const char*);
+ //Eid: external sid
+ int getInternalSid(int eid) {
+ assert(eid > 0 && eid <= M);
+ return e2i[eid];
+ }
+
+ const Transcript& getTranscriptViaEid(int eid) {
+ return transcripts[getInternalSid(eid)];
+ }
+
+ void buildMappings(int, char**);
+
private:
int M, type; // type 0 from genome , 1 standalone transcriptome
std::vector<Transcript> transcripts;
+
+ std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
};
void Transcripts::readFrom(const char* inpF) {
fin>>M>>type;
getline(fin, line);
- transcripts.clear();
- transcripts.resize(M + 1);
+ transcripts.assign(M + 1, Transcript());
for (int i = 1; i <= M; i++) {
transcripts[i].read(fin);
}
fout.close();
}
+void Transcripts::buildMappings(int n_targets, char** target_name) {
+ std::map<std::string, int> dict;
+ std::map<std::string, int>::iterator iter;
+
+ general_assert(n_targets == M, "Number of transcripts does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+
+ dict.clear();
+ for (int i = 1; i <= M; i++) {
+ const std::string& tid = transcripts[i].getTranscriptID();
+ iter = dict.find(tid);
+ assert(iter == dict.end());
+ dict[tid] = i;
+ }
+
+ e2i.assign(M + 1, 0);
+ i2e.assign(M + 1, 0);
+ for (int i = 0; i < n_targets; i++) {
+ iter = dict.find(std::string(target_name[i]));
+ general_assert(iter != dict.end(), "RSEM can not recognize transcript " + cstrtos(target_name[i]) + "!");
+ general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+ e2i[i + 1] = iter->second;
+ i2e[iter->second] = i + 1;
+ iter->second = -1;
+ }
+}
+
#endif /* TRANSCRIPTS_H_ */
+RSEM v1.1.17
+
+- Fixed a bug related to parallezation of credibility intervals calculation
+- Added --no-bam-output option to rsem-calculate-expression
+- The order of @SQ tags in SAM/BAM files can be arbitrary now
+
+--------------------------------------------------------------------------------------------
+
RSEM v1.1.16
- Added --time option to show time consumed by each phase
use Pod::Usage;
use strict;
-my $standard_output = "&STDOUT";
-
-my $out_file = $standard_output;
+my $out_file = "";
my @tmp_dirs = ();
my $help = 0;
pod2usage(-verbose => 2) if ($help == 1);
-pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 1);
my $command;
-my (@fields, @header) = ();
-my $M;
-
-# Load fields
-my @lines = ();
-my $type;
-
-open(INPUT, "$ARGV[0].ti");
-@lines = <INPUT>;
-chomp(@lines);
-close(INPUT);
-
-@fields = ();
-($M, $type) = split(/ /, $lines[0]);
-for (my $i = 0; $i < $M; $i++) {
- push(@fields, "SN:$lines[$i * 6 + 1]");
-}
-
-
-# Reorder header
-my $line;
-
-open(INPUT, $ARGV[1]);
-@header = ();
-while (($line = <INPUT>) && substr($line, 0, 1) eq '@') {
- chomp($line);
- push(@header, $line);
-}
-close(INPUT);
-
-my $n = scalar(@header);
-if ($n > 0) {
- my %hash = ();
- my @ktable = ();
-
- my $tid = 0;
-
- for (my $i = 0; $i < $n; $i++) {
- my @arr = split(/\t/, $header[$i]);
- if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; }
- my $hasSN = 0;
- foreach my $key (@arr) {
- if (substr($key, 0, 3) eq "SN:") {
- $hash{$key} = $i;
- $hasSN = 1;
- last;
- }
- }
- if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); }
- push(@ktable, $fields[$tid++]);
- }
-
- if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); }
- open(OUTPUT, ">$out_file");
- for (my $i = 0; $i < $n; $i++) {
- if ($ktable[$i] eq "") { print OUTPUT $header[$i]; }
- else { print OUTPUT $header[$hash{$ktable[$i]}]; }
- print OUTPUT "\n";
- }
- close(OUTPUT);
-}
-
-
-# extract alignment section
-$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp";
+# grep header section
+$command = "grep ^@ $ARGV[0]";
+if ($out_file ne "") { $command .= " > $out_file"; }
&runCommand($command);
-# sort and output the alignment section
-$command = "sort -k 1,1 -s";
+# sort alignment section
+$command = "grep ^[^@] $ARGV[0] | sort -k 1,1 -s";
if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; }
-$command .= " $ARGV[1].__temp";
-if ($out_file ne $standard_output) { $command .= " >> $out_file"; }
-&runCommand($command);
-
-# delete temporary files
-$command = "rm -f $ARGV[1].__temp";
+if ($out_file ne "") { $command .= " >> $out_file"; }
&runCommand($command);
# finish
=over
- convert-sam-for-rsem [options] reference_name input_sam
+ convert-sam-for-rsem [options] input_sam
=back
=over
-=item B<reference_name>
-
-The name of the reference used. This should be the same name used by 'rsem-prepare-reference'.
-
=item B<input_sam>
The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information).
This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk.
-Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads.
+Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads.
Note: This program can only recognize SAM files. See ARGUMENTS section.
=head1 EXAMPLES
-Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'.
+Suppose input_sam is set to 'input.sam'.
1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz':
- convert-sam-for-rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz
+ convert-sam-for-rsem input.sam | gzip > input_for_rsem.sam.gz
2) Output to 'input_for_rsem.sam' directly:
- convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam
+ convert-sam-for-rsem input.sam -o input_for_rsem.sam
=cut
Transcript.h : utils.h
-Transcripts.h : Transcript.h
+Transcripts.h : my_assert.h Transcript.h
rsem-extract-reference-transcripts : utils.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
$(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts
HitContainer.h : GroupInfo.h
-SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h RefSeq.h Refs.h
+SamParser.h : sam/sam.h sam/bam.h utils.h my_assert.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcripts.h
rsem-parse-alignments : parseIt.o sam/libbam.a
$(CC) -o rsem-parse-alignments parseIt.o sam/libbam.a -lz
-parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h RefSeq.h Refs.h sam/sam.h sam/bam.h parseIt.cpp
+parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h Transcripts.h sam/sam.h sam/bam.h parseIt.cpp
$(CC) $(COFLAGS) parseIt.cpp
#include "utils.h"
#include "GroupInfo.h"
-
-#include "RefSeq.h"
-#include "Refs.h"
+#include "Transcripts.h"
#include "SingleRead.h"
#include "SingleReadQ.h"
int nHits; // # of hits
int nUnique, nMulti, nIsoMulti;
char fn_list[STRLEN];
-char refF[STRLEN], groupF[STRLEN];
+char groupF[STRLEN], tiF[STRLEN];
char imdName[STRLEN];
char datF[STRLEN], cntF[STRLEN];
-Refs refs;
GroupInfo gi;
+Transcripts transcripts;
SamParser *parser;
ofstream hit_out;
char* aux = 0;
if (strcmp(fn_list, "")) aux = fn_list;
- parser = new SamParser(alignFType, alignF, refs, aux);
+ parser = new SamParser(alignFType, alignF, transcripts, aux);
memset(cat, 0, sizeof(cat));
memset(readOutFs, 0, sizeof(readOutFs));
verbose = !quiet;
- sprintf(refF, "%s.seq", argv[1]);
- refs.loadRefs(refF, 1);
sprintf(groupF, "%s.grp", argv[1]);
gi.load(groupF);
+ sprintf(tiF, "%s.ti", argv[1]);
+ transcripts.readFrom(tiF);
sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
sprintf(datF, "%s.dat", imdName);
return (fr <= to ? str.substr(fr, to - fr + 1) : "");
}
-void printTimeUsed(const time_t& a, const time_t& b, const char* filename = "") {
- int hh = (b - a) / 3600;
- int mm = (b - a) % 3600 / 60;
- int ss = (b - a) % 60;
-
- printf("Time Used : %d h %02d m %02d s\n", hh, mm, ss);
-
- if (strcmp(filename, "")) {
- FILE *fo = fopen(filename, "w");
- fprintf(fo, "Time Used : %d h %02d m %02d s\n", hh, mm, ss);
- fclose(fo);
- }
-}
-
void genReadFileNames(const char* readFN, int tagType, int read_type, int& s, char readFs[][STRLEN]){
const char tags[3][STRLEN] = {"un", "alignable", "max"};
char suffix[STRLEN];
}
}
+void printTimeUsed(const time_t& a, const time_t& b, const char* program_name) {
+ int hh = (b - a) / 3600;
+ int mm = (b - a) % 3600 / 60;
+ int ss = (b - a) % 60;
+
+ printf("Time Used for %s : %d h %02d m %02d s\n", program_name, hh, mm, ss);
+}
+
#endif