#!/usr/bin/perl
use Getopt::Long;
use Pod::Usage;
use strict;
my $hard = 0;
my $soft = 0;
my $help = 0;
GetOptions("hard-threshold" => \$hard,
"soft-threshold" => \$soft,
"h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
pod2usage(-verbose => 2) if ($help == 1);
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
pod2usage(-msg => "--hard-threshold and --soft-threshold cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($hard && $soft);
if ($hard == 0 && $soft == 0) { $hard = 1; }
my $fdr = $ARGV[1];
open(INPUT, "$ARGV[0]");
open(OUTPUT, ">$ARGV[2]");
my $header = ;
chomp($header);
my @columns = split(/\t/, $header);
my $pos = 0;
while ($pos <= $#columns && $columns[$pos] ne "\"PPDE\"") { ++$pos; }
if ($pos > $#columns) { print "Error: Cannot find column PPDE!\n"; exit(-1); }
++$pos;
print OUTPUT "$header\n";
my ($n, $sum) = (0, 0);
my $line = "";
while($line = ) {
chomp($line);
my @fields = split(/\t/, $line);
my $ppee = 1.0 - $fields[$pos];
if ($hard) {
if ($ppee > $fdr) { last; }
++$n;
print OUTPUT "$line\n";
}
else {
if ($sum + $ppee > $fdr * ($n + 1)) { last; }
++$n;
$sum += $ppee;
print OUTPUT "$line\n";
}
}
print "There are $n genes/transcripts reported at FDR = $fdr.\n";
close(INPUT);
close(OUTPUT);
__END__
=head1 NAME
rsem-control-fdr
=head1 SYNOPSIS
rsem-control-fdr [options] input_file fdr_rate output_file
=head1 ARGUMENTS
=over
=item B
This should be the result file generated by 'rsem-run-ebseq', which contains all genes/transcripts and their associated statistics.
=item B
The desire false discovery rate (FDR).
=item B
This file is a subset of the 'input_file'. It only contains the genes/transcripts called as differentially expressed (DE). When more than 2 conditions exist, DE is defined as not all conditions are equally expressed.
=back
=head1 OPTIONS
=over
=item B<--hard-threshold>
Use hard threshold method to control FDR. If this option is set, only those genes/transcripts with their PPDE >= 1 - fdr_rate are called as DE. (Default: on)
=item B<--soft-threshold>
Use soft threshold method to control FDR. If this option is set, this program will try to report as many genes/transcripts as possible, as long as their average PPDE >= 1 - fdr_rate. (Default: off)
=item B<-h/--help>
Show help information.
=back
=head1 DESCRIPTION
This program controls the false discovery rate and reports differentially expressed genes/transcripts.
=head1 EXAMPLES
We assume that we have 'GeneMat.results' as input. We want to control FDR at 0.05 using hard threshold method and name the output file as 'GeneMat.de.txt':
rsem-control-fdr GeneMat.results 0.05 GeneMat.de.txt
If we instead want to use soft threshold method to obtain more called genes/transcripts, we can use:
rsem-control-fdr --soft-threshold GeneMat.results 0.05 GeneMat.de.txt
=cut