]> git.donarmstrong.com Git - bin.git/commitdiff
add genotypeizer
authorDon Armstrong <don@donarmstrong.com>
Wed, 5 May 2010 19:17:41 +0000 (19:17 +0000)
committerDon Armstrong <don@donarmstrong.com>
Wed, 5 May 2010 19:17:41 +0000 (19:17 +0000)
genotypeizer [new file with mode: 0755]

diff --git a/genotypeizer b/genotypeizer
new file mode 100755 (executable)
index 0000000..00478d0
--- /dev/null
@@ -0,0 +1,254 @@
+#! /usr/bin/perl
+# genotypeizer converts various genotype files into various formats,
+# and is released under the terms of the GPL version 2, or any later
+# version, at your option. See the file README and COPYING for more
+# information. Copyright 2009 by Don Armstrong <don@donarmstrong.com>.
+# $Id: perl_script 1432 2009-04-21 02:42:41Z don $
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+genotypeizer - converts varoius genotype files into various formats
+
+=head1 SYNOPSIS
+
+genotypeizer [options] [files to read]
+
+ Options:
+  --output file to output to; defaults to stdout
+  --output-format format to output to; defaults to R
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+The files to read should be either map/ped pairs or a single snpassoc
+R file.
+
+=head1 OPTIONS
+
+=over
+
+=item B<--output>
+
+File to output to. If the file format that you choose requires
+multiple files, the appropriate extension(s) will be appended to the
+name provided. [If --output isn't provided, STDOUT will be used, which
+will produce suboptimal results in the case of pedmap format.]
+
+=item B<--output-format>
+
+Output format to output. Currently valid values are R or pedmap. (R is
+the format used by the R snpassoc module, and pedmap is the format
+used by phylip.
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+
+=cut
+
+use POSIX qw(:fcntl_h);
+use IO::File;
+
+use vars qw($DEBUG);
+
+my %options = (output_format   => 'R',
+              output          => undef,
+              snps            => undef,
+              debug           => 0,
+              help            => 0,
+              man             => 0,
+              force           => 0,
+              );
+
+GetOptions(\%options,
+          'output=s',
+          'output_format|output-format=s',
+          'snps=s@',
+          'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+
+if ($options{output_format} !~ /^(R|pedmap|mapped)$/i) {
+     push @USAGE_ERRORS,"--output-format must be one of R or pedmap.";
+}
+
+$options{format} = lc($options{format});
+
+if ($options{format} eq 'mapped') {
+    $options{format} = 'pedmap';
+}
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+# open up output filehandles
+
+my %output_fh;
+
+my %file_ext= (pedmap => {ped => '.ped',
+                         map => '.map',
+                        },
+              R      => {main => '',
+                        },
+             );
+die "format $options{output_format} doesn't appear to actually be supported; this is a bug."
+    unless exists $file_ext{$options{output_format}};
+
+for my $file_ext (keys %{$file_ext{$options{output_format}}}) {
+    if (defined $options{output}) {
+       $output_fh{$file_ext} = IO::File->new($options{output}.$file_ext{$options{output_format}}{$file_ext},
+                                             O_CREAT|O_EXCL|O_WRONLY)
+           or die "Unable to open $options{output}"."$file_ext{$options{output_format}}{$file_ext} for writing: $!";
+    }
+    else {
+       $output_fh{$file_ext} = \*STDOUT;
+    }
+}
+
+# read it in; currently only support map/ped, but eventually do them
+# all.
+
+my %samples;
+my %extra_info;
+my %snps;
+
+my @map_ped = @ARGV;
+
+my @snp_order;
+
+my ($map_file,$ped_file);
+while (($map_file,$ped_file) = splice @map_ped,0,2) {
+     my $map_fh = IO::File->new($map_file,'r') or die "Unable to open $map_file for reading: $!";
+     my $ped_fh = IO::File->new($ped_file,'r') or die "Unable to open $ped_file for reading: $!";
+     my @map = ();
+     while (<$map_fh>) {
+         chomp;
+         my ($chr,$rsid,$something,$position) = split /\t/;
+         push @map,{chr  => $chr,
+                    rsid => $rsid,
+                    something => $something,
+                    position => $position,
+                   };
+         $snps{$rsid} = {chr  => $chr,
+                         rsid => $rsid,
+                         something => $something,
+                         position => $position,
+                        };
+     }
+     push @snp_order,@map;
+
+     while (<$ped_fh>) {
+         chomp;
+         next unless /^\d+/;
+         my @row = split /\s+/;
+         my ($family_id,$individual_id,$paternal_id,$maternal_id,$sex,$phenotype,@snps) = @row;
+         $extra_info{$individual_id} = {phenotype     => $phenotype,
+                                        sex           => $sex,
+                                        family_id     => $family_id,
+                                        individual_id => $individual_id,
+                                        maternal_id   => $maternal_id,
+                                        paternal_id   => $paternal_id,
+                                       };
+         # do it this way to avoid eating memory
+         $samples{$individual_id} = \@snps;
+         if (@snps != 2*@map) {
+              die "The map file and the number of snps don't appear to match";
+         }
+     }
+}
+
+
+my %valid_snps;
+
+if (defined $options{snps} and @{$options{snps}}) {
+    for my $snp_i (0..$#snp_order) {
+       for my $valid_snp (@{$options{snps}}) {
+           if ($snp_order[$snp_i]{rsid} =~ /(rs)?\Q$valid_snp\E/) {
+               $valid_snps{$snp_order[$snp_i]{rsid}} = $snp_i;
+               last;
+           }
+       }
+    }
+}
+
+my @snp_ordering;
+if (keys %valid_snps) {
+    @snp_ordering = sort {$valid_snps{$a} <=>
+                             $valid_snps{$b}
+                         } keys %valid_snps;
+}
+else {
+    @snp_ordering = map {$_->{rsid}} @snp_order;
+}
+
+
+@valid_snps{@{$options{snps}}} = (1) x $options{snps} if defined $options{snps} and @{$options{snps}};
+
+if ($options{output} eq 'pedmap') {
+    if (keys %valid_snps) {
+       %snps = map {($_,$snps{$_})} keys %valid_snps;
+    }
+    # output the map
+    for my $snp (@snp_ordering) {
+       print {$output_fh{map}} join("\t",@{$snps{$snp}}{qw(chr rsid something position)})."\n";
+    }
+    # output the ped
+    for my $individual_id (sort keys %extra_info) {
+       print {$output_fh{ped}}
+           join("\t",
+                @{$extra_info{$individual_id}}{qw(family_id individual_id),
+                                                   qw(paternal_id maternal_id sex phenotype)},
+                keys %valid_snps ? @{$samples{$individual_id}}[@valid_snps{@snp_ordering}] : @{$samples{$individual_id}}
+               )."\n";
+    }
+}
+else {
+    print {$output_fh{main}}
+       join("\t",
+            qw(individual_id family_id paternal_id maternal_id sex phenotype),
+            sort keys %snps
+           );
+    print {$output_fh{main}} "\n";
+    for my $individual_id (keys %samples) {
+       print {$output_fh{main}}
+           join("\t",
+                @{$extra_info{$individual_id}}{qw(individual_id family_id paternal_id maternal_id sex phenotype)},
+                map {my $a = $samples{$individual_id}{$_};
+                     if (not defined $a or not @$a or $a->[0] eq '0' or $a->[1] eq '0') {
+                         'NA'
+                     } else {
+                         join('',map{uc($_)} @$a);
+                     }
+                 } sort keys %snps
+               );
+       print {$output_fh{main}} "\n";
+    }
+}
+
+
+__END__