rsem-tbam2gbam
rsem-sam-validator
rsem-scan-for-paired-end-reads
+rsem-for-ebseq-calculate-clustering-info
sam/samtools
.project
.cproject
++cnt;
isPaired = (b->core.flag & 0x0001) > 0;
if (isPaired) {
- assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001) && b->core.tid == b2->core.tid);
- assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
- ++cnt;
+ assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001));
+ assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
+ assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080) || (b->core.flag & 0x0080) && (b2->core.flag & 0x0040));
+ ++cnt;
}
if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
// at least one segment is not properly mapped
- if ((b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004))) continue;
+ bool notgood = (b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004));
+
- const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
+ if (isPaired && notgood) assert((b->core.flag & 0x0004) && (b2->core.flag & 0x0004));
- convert(b, transcript);
- if (isPaired) {
- convert(b2, transcript);
- b->core.mpos = b2->core.pos;
- b2->core.mpos = b->core.pos;
+
+ if (!notgood) {
+ assert((b->core.tid == b2->core.tid) && (b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
+
+ const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
+
+ convert(b, transcript);
+ if (isPaired) {
+ convert(b2, transcript);
+ b->core.mpos = b2->core.pos;
+ b2->core.mpos = b->core.pos;
+ }
+
+ if (cqname != bam1_qname(b)) {
+ writeCollapsedLines();
+ cqname = bam1_qname(b);
+ collapseMap.init(isPaired);
+ }
+ collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
}
+ else {
+ assert(cqname != bam1_qname(b));
+
+ writeCollapsedLines();
+ cqname = bam1_qname(b);
+ collapseMap.init(isPaired);
- if (cqname != bam1_qname(b)) {
- writeCollapsedLines();
- cqname = bam1_qname(b);
- collapseMap.init(isPaired);
+ samwrite(out, b);
+ if (isPaired) samwrite(out, b2);
}
- collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
}
writeCollapsedLines();
while (collapseMap.next(tmp_b, tmp_b2, prb)) {
memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
tmp_b->core.qual = getMAPQ(prb);
- if (tmp_b->core.qual > 0) {
- samwrite(out, tmp_b);
- if (isPaired) {
- memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
- tmp_b2->core.qual = tmp_b->core.qual;
- samwrite(out, tmp_b2);
- }
+ samwrite(out, tmp_b);
+ if (isPaired) {
+ memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b2->core.qual = tmp_b->core.qual;
+ samwrite(out, tmp_b2);
}
bam_destroy1(tmp_b);
if (isPaired) bam_destroy1(tmp_b2);
-
}
}
}
while (samread(in, b) >= 0) {
++cnt;
- if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; }
+ if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
- if (b->core.flag & 0x0004) continue;
+ if (!(b->core.flag & 0x0004)) {
+ hit = wrapper.getNextHit();
+ assert(hit != NULL);
- hit = wrapper.getNextHit();
- assert(hit != NULL);
+ assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+ convert(b, hit->getConPrb());
+ }
- assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
- convert(b, hit->getConPrb());
- if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
+ //if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
+ samwrite(out, b);
}
assert(wrapper.getNextHit() == NULL);
while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
cnt += 2;
if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; }
- //mate info is not complete, skip
- if (!(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)))) continue;
- //unalignable reads, skip
- if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue;
-
- //swap if b is mate 2
- if (b->core.flag & 0x0080) {
- assert(b2->core.flag & 0x0040);
- bam1_t *tmp = b;
- b = b2; b2 = tmp;
- }
- hit = wrapper.getNextHit();
- assert(hit != NULL);
+ assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
+ assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080) || (b->core.flag & 0x0080) && (b2->core.flag & 0x0040));
+
+ //unalignable reads, skip
+ bool notgood = (b->core.flag & 0x0004) || (b2->core.flag & 0x0004);
- assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
- assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
+ if (!notgood) {
+ //swap if b is mate 2
+ if (b->core.flag & 0x0080) {
+ assert(b2->core.flag & 0x0040);
+ bam1_t *tmp = b;
+ b = b2; b2 = tmp;
+ }
- convert(b, hit->getConPrb());
- convert(b2, hit->getConPrb());
+ hit = wrapper.getNextHit();
+ assert(hit != NULL);
- b->core.mpos = b2->core.pos;
- b2->core.mpos = b->core.pos;
+ assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+ assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
- if (b->core.qual > 0) {
- samwrite(out, b);
- samwrite(out, b2);
+ convert(b, hit->getConPrb());
+ convert(b2, hit->getConPrb());
+
+ b->core.mpos = b2->core.pos;
+ b2->core.mpos = b->core.pos;
+ }
+ else {
+ // if only one mate can be aligned, mask it as unaligned and put an additional tag Z0:A:!
+ char exclamation = '!';
+ if (!(b->core.flag & 0x0004)) {
+ b->core.flag |= 0x0004;
+ bam_aux_append(b, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
+ }
+ if (!(b2->core.flag & 0x0004)) {
+ b2->core.flag |= 0x0004;
+ bam_aux_append(b2, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
+ }
}
+
+
+ samwrite(out, b);
+ samwrite(out, b2);
}
assert(wrapper.getNextHit() == NULL);
ifstream fin;
bool quiet = false;
- if (argc < 5) {
- printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
+ if (argc < 6) {
+ printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
printf(" refName: reference name\n");
printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
printf(" sampleName: sample's name, including the path\n");
strcpy(refName, argv[1]);
read_type = atoi(argv[2]);
strcpy(outName, argv[3]);
- sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
- sprintf(statName, "%s.stat/%s", argv[3], argv[4]);
+ strcpy(imdName, argv[4]);
+ strcpy(statName, argv[5]);
nThreads = 1;
genGibbsOut = false;
pt_fn_list = pt_chr_list = NULL;
- for (int i = 5; i < argc; i++) {
+ for (int i = 6; i < argc; i++) {
if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); }
if (!strcmp(argv[i], "-b")) {
genBamF = true;
int main(int argc, char* argv[]) {
if (argc < 7) {
- printf("Usage: rsem-run-gibbs-multi reference_name sample_name sampleToken BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
+ printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
exit(-1);
}
+ strcpy(imdName, argv[2]);
+ strcpy(statName, argv[3]);
+
BURNIN = atoi(argv[4]);
NSAMPLES = atoi(argv[5]);
GAP = atoi(argv[6]);
- sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
- sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
+
load_data(argv[1], statName, imdName);
nThreads = 1;
For more information about EBSeq (including the paper describing their
method), please visit <a
-href="http://www.biostat.wisc.edu/~ningleng/EBSeq_Package">here</a>. You
-can also find a local version of vignette under
+href="http://www.biostat.wisc.edu/~ningleng/EBSeq_Package">EBSeq
+website</a>. You can also find a local version of vignette under
'EBSeq/inst/doc/EBSeq_Vignette.pdf'.
EBSeq requires gene-isoform relationship for its isoform DE
int main(int argc, char* argv[]) {
if (argc < 8) {
- printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nCV nSpC nMB [-p #Threads] [-q]\n");
+ printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [-q]\n");
exit(-1);
}
+ strcpy(imdName, argv[2]);
+ strcpy(statName, argv[3]);
+
confidence = atof(argv[4]);
nCV = atoi(argv[5]);
nSpC = atoi(argv[6]);
cvlen = M + 1;
assert(nSamples > 0 && cvlen > 1); // for Buffter.h: (bufsize_type)nSamples
- sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
- sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
sprintf(tmpF, "%s.tmp", imdName);
sprintf(cvsF, "%s.countvectors", imdName);
READ_INT_TYPE nUnique, nMulti, nIsoMulti;
char fn_list[STRLEN];
char groupF[STRLEN], tiF[STRLEN];
-char imdName[STRLEN];
char datF[STRLEN], cntF[STRLEN];
GroupInfo gi;
bool quiet = false;
if (argc < 6) {
- printf("Usage : rsem-parse-alignments refName sampleName sampleToken alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
+ printf("Usage : rsem-parse-alignments refName imdName statName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
exit(-1);
}
sprintf(tiF, "%s.ti", argv[1]);
transcripts.readFrom(tiF);
- sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
- sprintf(datF, "%s.dat", imdName);
- sprintf(cntF, "%s.stat/%s.cnt", argv[2], argv[3]);
+ sprintf(datF, "%s.dat", argv[2]);
+ sprintf(cntF, "%s.cnt", argv[3]);
- init(imdName, argv[4][0], argv[5]);
+ init(argv[2], argv[4][0], argv[5]);
hit_out.open(datF);
my $mTime = 0;
my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
+my $mate1_list = "";
+my $mate2_list = "";
+my $inpF = "";
+
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
+my $gap = 32;
+
GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
+ "temporary-folder=s" => \$temp_dir,
"no-qualities" => \$no_qual,
"paired-end" => \$paired_end,
"strand-specific" => \$strand_specific,
if ($strand_specific) { $probF = 1.0; }
-my $mate1_list = "";
-my $mate2_list = "";
-my $inpF = "";
-
-my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = ();
-my $gap = 32;
-
if ($paired_end) {
if ($no_qual) { $read_type = 2; }
else { $read_type = 3; }
if ($pos < 0) { $sampleToken = $sampleName; }
else { $sampleToken = substr($sampleName, $pos + 1); }
-$temp_dir = "$sampleName.temp";
+if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
$stat_dir = "$sampleName.stat";
if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
$imdName = "$temp_dir/$sampleToken";
+$statName = "$stat_dir/$sampleToken";
if (!$is_sam && !$is_bam && !$no_qual && ($phred33 + $phred64 + $solexa == 0)) { $phred33 = 1; }
$command .= " -1 $mate1_list -2 $mate2_list";
}
- $command .= " | gzip > $sampleName.sam.gz";
+ # pipe to samtools to generate a BAM file
+ $command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
if ($mTime) { $time_start = time(); }
if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
- $inpF = "$sampleName.sam.gz";
- $is_sam = 1; # output of bowtie is a sam file
+ $inpF = "$imdName.bam";
+ $is_bam = 1; # alignments are outputed as a BAM file
}
if ($mTime) { $time_start = time(); }
-$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken";
+$command = $dir."rsem-parse-alignments $refName $imdName $statName";
my $samInpType;
if ($is_sam) { $samInpType = "s"; }
print OUTPUT "$L\n";
close(OUTPUT);
-$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads";
+$command = $dir."rsem-run-em $refName $read_type $sampleName $imdName $statName -p $nThreads";
if ($genBamF) {
$command .= " -b $samInpType $inpF";
if ($fn_list ne "") { $command .= " 1 $fn_list"; }
&runCommand($command);
+&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+
if ($genBamF) {
$command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
&runCommand($command);
}
}
-&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
-
if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
if ($mTime) { $time_start = time(); }
if ($calcCI || $var_opt) {
- $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $NCV $SAMPLEGAP";
+ $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
$command .= " -p $nThreads";
if ($var_opt) { $command .= " --var"; }
if ($quiet) { $command .= " -q"; }
Keep temporary files generated by RSEM. RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted. Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
+=item B<--temporary-folder> <string>
+
+Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
+
=item B<--time>
Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)