X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=genotypeizer;fp=genotypeizer;h=00478d0ac4f140feebbddb93e578a755206ef6a1;hb=35d3d7d8c8f715883a09c9d5fb0f0d6a19d002c5;hp=0000000000000000000000000000000000000000;hpb=ab2490071f13be33714700b79718d5cbb333f93a;p=bin.git diff --git a/genotypeizer b/genotypeizer new file mode 100755 index 0000000..00478d0 --- /dev/null +++ b/genotypeizer @@ -0,0 +1,254 @@ +#! /usr/bin/perl +# genotypeizer converts various genotype files into various formats, +# and is released under the terms of the GPL version 2, or any later +# version, at your option. See the file README and COPYING for more +# information. Copyright 2009 by Don Armstrong . +# $Id: perl_script 1432 2009-04-21 02:42:41Z don $ + + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +genotypeizer - converts varoius genotype files into various formats + +=head1 SYNOPSIS + +genotypeizer [options] [files to read] + + Options: + --output file to output to; defaults to stdout + --output-format format to output to; defaults to R + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +The files to read should be either map/ped pairs or a single snpassoc +R file. + +=head1 OPTIONS + +=over + +=item B<--output> + +File to output to. If the file format that you choose requires +multiple files, the appropriate extension(s) will be appended to the +name provided. [If --output isn't provided, STDOUT will be used, which +will produce suboptimal results in the case of pedmap format.] + +=item B<--output-format> + +Output format to output. Currently valid values are R or pedmap. (R is +the format used by the R snpassoc module, and pedmap is the format +used by phylip. + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + + +=cut + +use POSIX qw(:fcntl_h); +use IO::File; + +use vars qw($DEBUG); + +my %options = (output_format => 'R', + output => undef, + snps => undef, + debug => 0, + help => 0, + man => 0, + force => 0, + ); + +GetOptions(\%options, + 'output=s', + 'output_format|output-format=s', + 'snps=s@', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; + +if ($options{output_format} !~ /^(R|pedmap|mapped)$/i) { + push @USAGE_ERRORS,"--output-format must be one of R or pedmap."; +} + +$options{format} = lc($options{format}); + +if ($options{format} eq 'mapped') { + $options{format} = 'pedmap'; +} + +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + + +# open up output filehandles + +my %output_fh; + +my %file_ext= (pedmap => {ped => '.ped', + map => '.map', + }, + R => {main => '', + }, + ); +die "format $options{output_format} doesn't appear to actually be supported; this is a bug." + unless exists $file_ext{$options{output_format}}; + +for my $file_ext (keys %{$file_ext{$options{output_format}}}) { + if (defined $options{output}) { + $output_fh{$file_ext} = IO::File->new($options{output}.$file_ext{$options{output_format}}{$file_ext}, + O_CREAT|O_EXCL|O_WRONLY) + or die "Unable to open $options{output}"."$file_ext{$options{output_format}}{$file_ext} for writing: $!"; + } + else { + $output_fh{$file_ext} = \*STDOUT; + } +} + +# read it in; currently only support map/ped, but eventually do them +# all. + +my %samples; +my %extra_info; +my %snps; + +my @map_ped = @ARGV; + +my @snp_order; + +my ($map_file,$ped_file); +while (($map_file,$ped_file) = splice @map_ped,0,2) { + my $map_fh = IO::File->new($map_file,'r') or die "Unable to open $map_file for reading: $!"; + my $ped_fh = IO::File->new($ped_file,'r') or die "Unable to open $ped_file for reading: $!"; + my @map = (); + while (<$map_fh>) { + chomp; + my ($chr,$rsid,$something,$position) = split /\t/; + push @map,{chr => $chr, + rsid => $rsid, + something => $something, + position => $position, + }; + $snps{$rsid} = {chr => $chr, + rsid => $rsid, + something => $something, + position => $position, + }; + } + push @snp_order,@map; + + while (<$ped_fh>) { + chomp; + next unless /^\d+/; + my @row = split /\s+/; + my ($family_id,$individual_id,$paternal_id,$maternal_id,$sex,$phenotype,@snps) = @row; + $extra_info{$individual_id} = {phenotype => $phenotype, + sex => $sex, + family_id => $family_id, + individual_id => $individual_id, + maternal_id => $maternal_id, + paternal_id => $paternal_id, + }; + # do it this way to avoid eating memory + $samples{$individual_id} = \@snps; + if (@snps != 2*@map) { + die "The map file and the number of snps don't appear to match"; + } + } +} + + +my %valid_snps; + +if (defined $options{snps} and @{$options{snps}}) { + for my $snp_i (0..$#snp_order) { + for my $valid_snp (@{$options{snps}}) { + if ($snp_order[$snp_i]{rsid} =~ /(rs)?\Q$valid_snp\E/) { + $valid_snps{$snp_order[$snp_i]{rsid}} = $snp_i; + last; + } + } + } +} + +my @snp_ordering; +if (keys %valid_snps) { + @snp_ordering = sort {$valid_snps{$a} <=> + $valid_snps{$b} + } keys %valid_snps; +} +else { + @snp_ordering = map {$_->{rsid}} @snp_order; +} + + +@valid_snps{@{$options{snps}}} = (1) x $options{snps} if defined $options{snps} and @{$options{snps}}; + +if ($options{output} eq 'pedmap') { + if (keys %valid_snps) { + %snps = map {($_,$snps{$_})} keys %valid_snps; + } + # output the map + for my $snp (@snp_ordering) { + print {$output_fh{map}} join("\t",@{$snps{$snp}}{qw(chr rsid something position)})."\n"; + } + # output the ped + for my $individual_id (sort keys %extra_info) { + print {$output_fh{ped}} + join("\t", + @{$extra_info{$individual_id}}{qw(family_id individual_id), + qw(paternal_id maternal_id sex phenotype)}, + keys %valid_snps ? @{$samples{$individual_id}}[@valid_snps{@snp_ordering}] : @{$samples{$individual_id}} + )."\n"; + } +} +else { + print {$output_fh{main}} + join("\t", + qw(individual_id family_id paternal_id maternal_id sex phenotype), + sort keys %snps + ); + print {$output_fh{main}} "\n"; + for my $individual_id (keys %samples) { + print {$output_fh{main}} + join("\t", + @{$extra_info{$individual_id}}{qw(individual_id family_id paternal_id maternal_id sex phenotype)}, + map {my $a = $samples{$individual_id}{$_}; + if (not defined $a or not @$a or $a->[0] eq '0' or $a->[1] eq '0') { + 'NA' + } else { + join('',map{uc($_)} @$a); + } + } sort keys %snps + ); + print {$output_fh{main}} "\n"; + } +} + + +__END__