#!/usr/bin/perl use Getopt::Long; use Pod::Usage; use strict; my $hard = 0; my $soft = 0; my $help = 0; GetOptions("hard-threshold" => \$hard, "soft-threshold" => \$soft, "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2); pod2usage(-verbose => 2) if ($help == 1); pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3); pod2usage(-msg => "--hard-threshold and --soft-threshold cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($hard && $soft); if ($hard == 0 && $soft == 0) { $hard = 1; } my $fdr = $ARGV[1]; open(INPUT, "$ARGV[0]"); open(OUTPUT, ">$ARGV[2]"); my $header = ; chomp($header); my @columns = split(/\t/, $header); my $pos = 0; while ($pos <= $#columns && $columns[$pos] ne "\"PPDE\"") { ++$pos; } if ($pos > $#columns) { print "Error: Cannot find column PPDE!\n"; exit(-1); } ++$pos; print OUTPUT "$header\n"; my ($n, $sum) = (0, 0); my $line = ""; while($line = ) { chomp($line); my @fields = split(/\t/, $line); my $ppee = 1.0 - $fields[$pos]; if ($hard) { if ($ppee > $fdr) { last; } ++$n; print OUTPUT "$line\n"; } else { if ($sum + $ppee > $fdr * ($n + 1)) { last; } ++$n; $sum += $ppee; print OUTPUT "$line\n"; } } print "There are $n genes/transcripts reported at FDR = $fdr.\n"; close(INPUT); close(OUTPUT); __END__ =head1 NAME rsem-control-fdr =head1 SYNOPSIS rsem-control-fdr [options] input_file fdr_rate output_file =head1 ARGUMENTS =over =item B This should be the result file generated by 'rsem-run-ebseq', which contains all genes/transcripts and their associated statistics. =item B The desire false discovery rate (FDR). =item B This file is a subset of the 'input_file'. It only contains the genes/transcripts called as differentially expressed (DE). When more than 2 conditions exist, DE is defined as not all conditions are equally expressed. =back =head1 OPTIONS =over =item B<--hard-threshold> Use hard threshold method to control FDR. If this option is set, only those genes/transcripts with their PPDE >= 1 - fdr_rate are called as DE. (Default: on) =item B<--soft-threshold> Use soft threshold method to control FDR. If this option is set, this program will try to report as many genes/transcripts as possible, as long as their average PPDE >= 1 - fdr_rate. (Default: off) =item B<-h/--help> Show help information. =back =head1 DESCRIPTION This program controls the false discovery rate and reports differentially expressed genes/transcripts. =head1 EXAMPLES We assume that we have 'GeneMat.results' as input. We want to control FDR at 0.05 using hard threshold method and name the output file as 'GeneMat.de.txt': rsem-control-fdr GeneMat.results 0.05 GeneMat.de.txt If we instead want to use soft threshold method to obtain more called genes/transcripts, we can use: rsem-control-fdr --soft-threshold GeneMat.results 0.05 GeneMat.de.txt =cut