From: martinahansen Date: Mon, 28 Jul 2008 12:23:23 +0000 (+0000) Subject: added new biopiece merge_records X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=706d1e4f39737d7dd0d2b8d812a977590d4e5310;p=biopieces.git added new biopiece merge_records git-svn-id: http://biopieces.googlecode.com/svn/trunk@197 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/merge_records b/bp_bin/merge_records new file mode 100755 index 0000000..4cd1d44 --- /dev/null +++ b/bp_bin/merge_records @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biopieces; diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index b57f279..0a6c847 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -236,6 +236,7 @@ sub run_script elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) } elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) } elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) } + elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) } elsif ( $script eq "grab" ) { script_grab( $in, $out, $options ) } elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) } elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) } @@ -749,6 +750,12 @@ sub get_options delimit|d=s ); } + elsif ( $script eq "merge_records" ) + { + @options = qw( + keys|k=s + ); + } elsif ( $script eq "grab" ) { @options = qw( @@ -4595,6 +4602,132 @@ sub script_merge_vals } +sub script_merge_records +{ + # Martin A. Hansen, July 2008. + + # Merges records in the stream based on identical values of two given keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file1, $file2, $fh1, $fh2, $key1, $key2, @keys1, @keys2, @vals1, @vals2, + $num1, $num2, $num, $cmp, $i ); + + $file1 = "$BP_TMP/merge_records1.tmp"; + $file2 = "$BP_TMP/merge_records2.tmp"; + + $fh1 = Maasha::Common::write_open( $file1 ); + $fh2 = Maasha::Common::write_open( $file2 ); + + $key1 = $options->{ "keys" }->[ 0 ]; + $key2 = $options->{ "keys" }->[ 1 ]; + + $num = $key2 =~ s/n$//; + $num1 = 0; + $num2 = 0; + + while ( $record = get_record( $in ) ) + { + if ( exists $record->{ $key1 } ) + { + @keys1 = $key1; + @vals1 = $record->{ $key1 }; + + delete $record->{ $key1 }; + + map { push @keys1, $_; push @vals1, $record->{ $_ } } keys %{ $record }; + + print $fh1 join( "\t", @vals1 ), "\n"; + + $num1++; + } + elsif ( exists $record->{ $key2 } ) + { + @keys2 = $key2; + @vals2 = $record->{ $key2 }; + + delete $record->{ $key2 }; + + map { push @keys2, $_; push @vals2, $record->{ $_ } } keys %{ $record }; + + print $fh2 join( "\t", @vals2 ), "\n"; + + $num2++; + } + } + + close $fh1; + close $fh2; + + if ( $num ) + { + Maasha::Common::run( "sort", "-k 1,1n $file1 > $file1.sort" ) and rename "$file1.sort", $file1; + Maasha::Common::run( "sort", "-k 1,1n $file2 > $file2.sort" ) and rename "$file2.sort", $file2; + } + else + { + Maasha::Common::run( "sort", "-k 1,1 $file1 > $file1.sort" ) and rename "$file1.sort", $file1; + Maasha::Common::run( "sort", "-k 1,1 $file2 > $file2.sort" ) and rename "$file2.sort", $file2; + } + + $fh1 = Maasha::Common::read_open( $file1 ); + $fh2 = Maasha::Common::read_open( $file2 ); + + @vals1 = Maasha::Common::get_fields( $fh1 ); + @vals2 = Maasha::Common::get_fields( $fh2 ); + + while ( $num1 > 0 and $num2 > 0 ) + { + if ( $num ) { + $cmp = $vals1[ 0 ] <=> $vals2[ 0 ]; + } else { + $cmp = $vals1[ 0 ] cmp $vals2[ 0 ]; + } + + if ( $cmp < 0 ) + { + @vals1 = Maasha::Common::get_fields( $fh1 ); + $num1--; + } + elsif ( $cmp > 0 ) + { + @vals2 = Maasha::Common::get_fields( $fh2 ); + $num2--; + } + else + { + undef $record; + + for ( $i = 0; $i < @keys1; $i++ ) { + $record->{ $keys1[ $i ] } = $vals1[ $i ]; + } + + for ( $i = 1; $i < @keys2; $i++ ) { + $record->{ $keys2[ $i ] } = $vals2[ $i ]; + } + + put_record( $record, $out ); + + @vals1 = Maasha::Common::get_fields( $fh1 ); + @vals2 = Maasha::Common::get_fields( $fh2 ); + $num1--; + $num2--; + } + } + + close $fh1; + close $fh2; + + unlink $file1; + unlink $file2; +} + + sub script_grab { # Martin A. Hansen, August 2007. @@ -5090,6 +5223,7 @@ sub script_count_vals $fh_out = Maasha::Common::write_open( $tmp_file ); + $cache = 0; $num = 0; while ( $record = get_record( $in ) ) @@ -5116,7 +5250,7 @@ sub script_count_vals if ( $cache ) { - $num = 0; + $num = 0; $fh_in = Maasha::Common::read_open( $tmp_file ); diff --git a/code_perl/Maasha/Common.pm b/code_perl/Maasha/Common.pm index ae2be08..8428aa6 100644 --- a/code_perl/Maasha/Common.pm +++ b/code_perl/Maasha/Common.pm @@ -574,6 +574,36 @@ sub get_basename } +sub get_fields +{ + # Martin A. Hansen, July 2008. + + # Given a filehandle to a file gets the + # next line which is split into a list of + # fields that is returned. + + my ( $fh, # filehandle + $delimiter, # field seperator - OPTIONAL + ) = @_; + + # Returns a list. + + my ( $line, @fields ); + + $line = <$fh>; + + chomp $line; + + return if not $line; + + $delimiter ||= "\t"; + + @fields = split "$delimiter", $line; + + return wantarray ? @fields : \@fields; +} + + sub file_read { # Martin A. Hansen, December 2004. diff --git a/code_perl/Maasha/Matrix.pm b/code_perl/Maasha/Matrix.pm index 941ec82..74df868 100644 --- a/code_perl/Maasha/Matrix.pm +++ b/code_perl/Maasha/Matrix.pm @@ -969,6 +969,80 @@ sub tabulate } +sub merge_tabs +{ + # Martin A. Hansen, July 2008. + + # Merge two given tables based on identifiers in a for each table + # specified column which should contain a unique identifier. + # Initially the tables are sorted and tab2 is merged onto tab1 + # row-wise. + + my ( $tab1, # table 1 - an AoA. + $tab2, # table 2 - an AoA. + $col1, # identifier in row1 + $col2, # identifier in row2 + $sort_type, # alphabetical or numeric comparison + ) = @_; + + # Returns nothing. + + my ( $num, $cmp, $i, $c, @row_cpy, $max ); + + $max = 0; + $num = 0; + + if ( $sort_type =~ /num/i ) + { + $num = 1; + + @{ $tab1 } = sort { $a->[ $col1 ] <=> $b->[ $col1 ] } @{ $tab1 }; + @{ $tab2 } = sort { $a->[ $col2 ] <=> $b->[ $col2 ] } @{ $tab2 }; + } + else + { + @{ $tab1 } = sort { $a->[ $col1 ] cmp $b->[ $col1 ] } @{ $tab1 }; + @{ $tab2 } = sort { $a->[ $col2 ] cmp $b->[ $col2 ] } @{ $tab2 }; + } + + $i = 0; + $c = 0; + + while ( $i < @{ $tab1 } and $c < @{ $tab2 } ) + { + if ( $num ) { + $cmp = $tab1->[ $i ]->[ $col1 ] <=> $tab2->[ $c ]->[ $col2 ]; + } else { + $cmp = $tab1->[ $i ]->[ $col1 ] cmp $tab2->[ $c ]->[ $col2 ]; + } + + if ( $cmp == 0 ) + { + @row_cpy = @{ $tab2->[ $c ] }; + + splice @row_cpy, $col2, 1; + + push @{ $tab1->[ $i ] }, @row_cpy; + + $i++; + $c++; + } + elsif ( $cmp > 0 ) + { + $c++; + } + else + { + map { push @{ $tab1->[ $i ] }, "null" } 0 .. ( scalar @{ $tab2->[ $c ] } - 2 ); + + $i++; + } + } + + map { push @{ $tab1->[ -1 ] }, "null" } 0 .. ( scalar @{ $tab1->[ 0 ] } - scalar @{ $tab1->[ -1 ] } + 1 ); +} + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BINARY SEARCH <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<