From: martinahansen Date: Tue, 16 Sep 2008 00:06:05 +0000 (+0000) Subject: minor updates X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=b871b345e7e2cb57c0d6e7b27ddbbdadbffb96ac;p=biopieces.git minor updates git-svn-id: http://biopieces.googlecode.com/svn/trunk@257 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_c/Maasha/src/bed_sort.c b/code_c/Maasha/src/bed_sort.c index d8e84eb..851a74e 100644 --- a/code_c/Maasha/src/bed_sort.c +++ b/code_c/Maasha/src/bed_sort.c @@ -4,19 +4,105 @@ #include "list.h" #include "ucsc.h" +static void usage() +{ + fprintf( stderr, + "\n" + "bed_sort - sorts a BED file.\n" + "\n" + "Usage: bed_sort [options] \n" + "\n" + "Options:\n" + " [-s | --sort ] # 1: chr AND chr_beg.\n" + " # 2: chr AND strand AND chr_beg.\n" + " # 3: chr_beg.\n" + " # 4: strand AND chr_beg.\n" + " [-c | --cols ] # Number of columns to read (default all).\n" + " [-d | --dir ] # Directory to use for file bound sorting.\n" + "\n" + "Examples:\n" + " bed_sort test.bed > test.bed.sort\n" + "\n" + ); + + exit( EXIT_FAILURE ); +} + + +static struct option longopts[] = { + { "sort", required_argument, NULL, 's' }, + { "cols", required_argument, NULL, 'c' }, + { "dir", required_argument, NULL, 'd' }, + { NULL, 0, NULL, 0 } +}; + int main( int argc, char *argv[] ) { + int opt = 0; + int sort = 1; + int cols = 0; + char *dir = NULL; char *file = NULL; list_sl *entries = NULL; - file = argv[ 1 ]; + while ( ( opt = getopt_long( argc, argv, "n:l", longopts, NULL ) ) != -1 ) + { + switch ( opt ) { + case 's': sort = strtol( optarg, NULL, 0 ); break; + case 'c': cols = strtol( optarg, NULL, 0 ); break; + case 'd': dir = optarg; break; + default: break; + } + } + + printf( "sort: %d cols: %d dir: %s\n", sort, cols, dir ); + + argc -= optind; + argv += optind; + + if ( sort < 1 || sort > 4 ) + { + fprintf( stderr, "ERROR: argument to --sort must be 1, 2, 3 or 4 - not: %d\n", sort ); + abort(); + } + + if ( cols != 0 && cols != 3 && cols != 4 && cols != 5 && cols != 6 && cols != 12 ) + { + fprintf( stderr, "ERROR: argument to --cols must be 3, 4, 5, 6 or 12 - not: %d\n", cols ); + abort(); + } + + if ( ( sort == 2 || sort == 4 ) && ( cols > 0 && cols < 6 ) ) + { + fprintf( stderr, "ERROR: cannot sort on strand with cols (%d) less than 6\n", cols ); + abort(); + } + + if ( dir != NULL ) + { + fprintf( stderr, "ERROR: directory: %s does not exists\n", dir ); + abort(); + } + + if ( argc < 1 ) { + usage(); + } + + file = argv[ argc - 1 ]; - entries = bed_entries_get( file, 3 ); + entries = bed_entries_get( file, cols ); - list_sl_sort( &entries, cmp_bed_sort_chr_beg ); + switch ( sort ) + { + case 1: list_sl_sort( &entries, cmp_bed_sort_chr_beg ); break; + case 2: list_sl_sort( &entries, cmp_bed_sort_chr_strand_beg ); break; + case 3: list_sl_sort( &entries, cmp_bed_sort_beg ); break; + case 4: list_sl_sort( &entries, cmp_bed_sort_strand_beg ); break; + default: break; + } - bed_entries_put( entries, 3 ); + bed_entries_put( entries, cols ); return EXIT_SUCCESS; } diff --git a/code_c/Maasha/src/inc/common.h b/code_c/Maasha/src/inc/common.h index 83360ff..4d33bf7 100644 --- a/code_c/Maasha/src/inc/common.h +++ b/code_c/Maasha/src/inc/common.h @@ -10,14 +10,24 @@ #include #include #include +#include typedef unsigned char uchar; typedef unsigned short ushort; typedef long long llong; -typedef char bool; +typedef char boolean; +#ifndef bool +#define bool boolean +#endif + +#ifndef TRUE #define TRUE 1 +#endif + +#ifndef FALSE #define FALSE 0 +#endif /* Macros for determining min or max of two given values. */ #define MAX( a, b ) a < b ? b : a @@ -29,12 +39,18 @@ typedef char bool; /* Neat debug macro. */ #define DEBUG_EXIT 0 + +#ifndef die #define die assert( DEBUG_EXIT ) +#endif /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MISC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ +/* Function that prints "pong" to stderr. */ +void maasha_ping(); + /* Return a binary number as a string of 1's and 0's. */ char *bits2string( uint bin ); diff --git a/code_c/Maasha/src/inc/ucsc.h b/code_c/Maasha/src/inc/ucsc.h index 6e655c5..291c05b 100644 --- a/code_c/Maasha/src/inc/ucsc.h +++ b/code_c/Maasha/src/inc/ucsc.h @@ -7,30 +7,81 @@ #define BED_BLOCKSIZES_MAX 256 #define BED_QBEGS_MAX 256 +/* Structure of a BED entry with the 12 BED elements. */ +/* http://genome.ucsc.edu/FAQ/FAQformat#format1 */ struct _bed_entry { - int cols; - char *chr; - uint chr_beg; - uint chr_end; - char *q_id; - int score; - char strand; - uint thick_beg; - uint thick_end; - char *itemrgb; - uint blockcount; - char *blocksizes; - char *q_begs; + int cols; /* Number of BED elements used. */ + char *chr; /* Chromosome name. */ + uint chr_beg; /* Chromosome begin position. */ + uint chr_end; /* Chromosome end position. */ + char *q_id; /* Query ID. */ + int score; /* Score. */ + char strand; /* Strand. */ + uint thick_beg; /* Begin position of thick drawing. */ + uint thick_end; /* End position of thick drawing. */ + char *itemrgb; /* RGB color (255,255,255).*/ + uint blockcount; /* Number of blocks (exons). */ + char *blocksizes; /* Comma separated string of blocks sizes. */ + char *q_begs; /* Comma separated string of block begins. */ }; typedef struct _bed_entry bed_entry; +/* Returns a new BED entry with memory allocated for */ +/* a given number of columns. */ bed_entry *bed_entry_new( const int cols ); + +/* Free memory for a BED entry. */ +void bed_entry_destroy( bed_entry *entry ); + +/* Get next BED entry of a given number of columns from a file pointer. */ bed_entry *bed_entry_get( FILE *fp, const int cols ); + +/* Get a singly linked list with all BED entries (of a given number of coluns */ +/* from a specified file. */ list_sl *bed_entries_get( char *path, const int cols ); + +/* Output a given number of columns from a BED entry to stdout. */ void bed_entry_put( bed_entry *entry, int cols ); + +/* Output a given number of columns from all BED entries */ +/* in a singly linked list. */ void bed_entries_put( list_sl *entries, int cols ); + +/* Free memory for all BED entries and list nodes. */ +void bed_entries_destroy( list_sl **entries_ppt ); + +/* Given a path to a BED file, read the given number of cols */ +/* according to the begin position. The result is written to stdout. */ +void bed_file_sort_beg( char *path, int cols ); + +/* Given a path to a BED file, read the given number of cols */ +/* according to the strand AND begin position. The result is written to stdout. */ +void bed_file_sort_strand_beg( char *path, int cols ); + +/* Given a path to a BED file, read the given number of cols */ +/* according to the chromosome AND begin position. The result is written to stdout. */ +void bed_file_sort_chr_beg( char *path, int cols ); + +/* Given a path to a BED file, read the given number of cols */ +/* according to the chromosome AND strand AND begin position. The result is written to stdout. */ +void bed_file_sort_chr_strand_beg( char *path, int cols ); + +/* Compare function for sorting a singly linked list of BED entries */ +/* according to begin position. */ int cmp_bed_sort_beg( const void *a, const void *b ); + +/* Compare function for sorting a singly linked list of BED entries */ +/* according to strand AND begin position. */ +int cmp_bed_sort_strand_beg( const void *a, const void *b ); + +/* Compare function for sorting a singly linked list of BED entries */ +/* according to chromosome name AND begin position. */ int cmp_bed_sort_chr_beg( const void *a, const void *b ); + +/* Compare function for sorting a singly linked list of BED entries */ +/* according to chromosome name AND strand AND begin position. */ int cmp_bed_sort_chr_strand_beg( const void *a, const void *b ); + + diff --git a/code_c/Maasha/src/lib/common.c b/code_c/Maasha/src/lib/common.c index 41dc8c2..2b9acb5 100644 --- a/code_c/Maasha/src/lib/common.c +++ b/code_c/Maasha/src/lib/common.c @@ -8,6 +8,16 @@ /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MISC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ +void maasha_ping() +{ + /* Martin A. Hansen, Septermber 2008 */ + + /* Function that prints "pong" to stderr. */ + + fprintf( stderr, "pong\n" ); +} + + char *bits2string( uint bin ) { /* Martin A. Hansen, June 2008 */ diff --git a/code_c/Maasha/src/lib/ucsc.c b/code_c/Maasha/src/lib/ucsc.c index bdb722a..89d8083 100644 --- a/code_c/Maasha/src/lib/ucsc.c +++ b/code_c/Maasha/src/lib/ucsc.c @@ -11,6 +11,11 @@ bed_entry *bed_entry_new( const int cols ) { + /* Martin A. Hansen, September 2008 */ + + /* Returns a new BED entry with memory allocated for */ + /* a given number of columns. */ + bed_entry *entry = mem_get( sizeof( bed_entry ) ); entry->cols = cols; @@ -43,7 +48,7 @@ bed_entry *bed_entry_new( const int cols ) entry->thick_beg = 0; entry->thick_end = 0; entry->itemrgb = mem_get( BED_ITEMRGB_MAX ); - entry->blockcount = 0;; + entry->blockcount = 0; entry->blocksizes = mem_get( BED_BLOCKSIZES_MAX ); entry->q_begs = mem_get( BED_QBEGS_MAX ); @@ -51,8 +56,42 @@ bed_entry *bed_entry_new( const int cols ) } +void bed_entry_destroy( bed_entry *entry ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Free memory for a BED entry. */ + + int cols = entry->cols; + + if ( cols > 6 ) + { + free( entry->itemrgb ); + free( entry->blocksizes ); + free( entry->q_begs ); + free( entry->q_id ); + free( entry->chr ); + } + else if ( cols > 3 ) + { + free( entry->q_id ); + free( entry->chr ); + } + else + { + free( entry->chr ); + } + + mem_free( &entry ); +} + + bed_entry *bed_entry_get( FILE *fp, int cols ) { + /* Martin A. Hansen, September 2008 */ + + /* Get next BED entry of a given number of columns from a file pointer. */ + bed_entry *entry = bed_entry_new( cols ); char buffer[ BED_BUFFER ]; @@ -153,6 +192,11 @@ bed_entry *bed_entry_get( FILE *fp, int cols ) list_sl *bed_entries_get( char *path, const int cols ) { + /* Martin A. Hansen, September 2008 */ + + /* Get a singly linked list with all BED entries (of a given number of coluns */ + /* from a specified file. */ + list_sl *list = list_sl_new(); node_sl *node = node_sl_new(); node_sl *old_node = NULL; @@ -189,6 +233,10 @@ list_sl *bed_entries_get( char *path, const int cols ) void bed_entry_put( bed_entry *entry, int cols ) { + /* Martin A. Hansen, September 2008 */ + + /* Output a given number of columns from a BED entry to stdout. */ + if ( ! cols ) { cols = entry->cols; } @@ -264,6 +312,11 @@ void bed_entry_put( bed_entry *entry, int cols ) void bed_entries_put( list_sl *entries, int cols ) { + /* Martin A. Hansen, September 2008 */ + + /* Output a given number of columns from all BED entries */ + /* in a singly linked list. */ + node_sl *node = NULL; for ( node = entries->first; node != NULL; node = node->next ) { @@ -272,8 +325,122 @@ void bed_entries_put( list_sl *entries, int cols ) } +void bed_entries_destroy( list_sl **entries_ppt ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Free memory for all BED entries and list nodes. */ + + list_sl *entries = *entries_ppt; + node_sl *node = NULL; + node_sl *next = NULL; + + next = entries->first; + + while ( next != NULL ) + { + node = next; + + bed_entry_destroy( ( bed_entry * ) node->val ); + + next = node->next; + + mem_free( &node ); + } + + mem_free( &entries ); + + *entries_ppt = NULL; +} + + +void bed_file_sort_beg( char *path, int cols ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Given a path to a BED file, read the given number of cols */ + /* according to the begin position. The result is written to stdout. */ + + list_sl *entries = NULL; + + entries = bed_entries_get( path, cols ); + + list_sl_sort( &entries, cmp_bed_sort_beg ); + + bed_entries_put( entries, cols ); + + bed_entries_destroy( &entries ); +} + + +void bed_file_sort_strand_beg( char *path, int cols ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Given a path to a BED file, read the given number of cols */ + /* according to the strand AND begin position. The result is written to stdout. */ + + assert( cols >= 6 ); + + list_sl *entries = NULL; + + entries = bed_entries_get( path, cols ); + + list_sl_sort( &entries, cmp_bed_sort_strand_beg ); + + bed_entries_put( entries, cols ); + + bed_entries_destroy( &entries ); +} + + +void bed_file_sort_chr_beg( char *path, int cols ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Given a path to a BED file, read the given number of cols */ + /* according to the chromosome AND begin position. The result is written to stdout. */ + + list_sl *entries = NULL; + + entries = bed_entries_get( path, cols ); + + list_sl_sort( &entries, cmp_bed_sort_chr_beg ); + + bed_entries_put( entries, cols ); + + bed_entries_destroy( &entries ); +} + + +void bed_file_sort_chr_strand_beg( char *path, int cols ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Given a path to a BED file, read the given number of cols */ + /* according to the chromosome AND strand AND begin position. The result is written to stdout. */ + + assert( cols >= 6 ); + + list_sl *entries = NULL; + + entries = bed_entries_get( path, cols ); + + list_sl_sort( &entries, cmp_bed_sort_chr_strand_beg ); + + bed_entries_put( entries, cols ); + + bed_entries_destroy( &entries ); +} + + int cmp_bed_sort_beg( const void *a, const void *b ) { + /* Martin A. Hansen, September 2008 */ + + /* Compare function for sorting a singly linked list of BED entries */ + /* according to begin position. */ + node_sl *a_node = *( ( node_sl ** ) a ); node_sl *b_node = *( ( node_sl ** ) b ); @@ -290,8 +457,40 @@ int cmp_bed_sort_beg( const void *a, const void *b ) } +int cmp_bed_sort_strand_beg( const void *a, const void *b ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Compare function for sorting a singly linked list of BED entries */ + /* according to strand AND begin position. */ + + node_sl *a_node = *( ( node_sl ** ) a ); + node_sl *b_node = *( ( node_sl ** ) b ); + + bed_entry *a_entry = ( bed_entry * ) a_node->val; + bed_entry *b_entry = ( bed_entry * ) b_node->val; + + if ( a_entry->strand < b_entry->strand ) { + return -1; + } else if ( a_entry->strand > b_entry->strand ) { + return 1; + } else if ( a_entry->chr_beg < b_entry->chr_beg ) { + return -1; + } else if ( a_entry->chr_beg > b_entry->chr_beg ) { + return 1; + } else { + return 0; + } +} + + int cmp_bed_sort_chr_beg( const void *a, const void *b ) { + /* Martin A. Hansen, September 2008 */ + + /* Compare function for sorting a singly linked list of BED entries */ + /* according to chromosome name AND begin position. */ + node_sl *a_node = *( ( node_sl ** ) a ); node_sl *b_node = *( ( node_sl ** ) b ); @@ -318,6 +517,11 @@ int cmp_bed_sort_chr_beg( const void *a, const void *b ) int cmp_bed_sort_chr_strand_beg( const void *a, const void *b ) { + /* Martin A. Hansen, September 2008 */ + + /* Compare function for sorting a singly linked list of BED entries */ + /* according to chromosome name AND strand AND begin position. */ + node_sl *a_node = *( ( node_sl ** ) a ); node_sl *b_node = *( ( node_sl ** ) b ); diff --git a/code_c/Maasha/src/repeat-O-matic.c b/code_c/Maasha/src/repeat-O-matic.c index 5f430d3..72dbf3a 100644 --- a/code_c/Maasha/src/repeat-O-matic.c +++ b/code_c/Maasha/src/repeat-O-matic.c @@ -1,6 +1,5 @@ /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */ -#include #include "common.h" #include "mem.h" #include "filesys.h" diff --git a/code_c/Maasha/src/test/test_ucsc.c b/code_c/Maasha/src/test/test_ucsc.c index 4ef1cd5..7153503 100644 --- a/code_c/Maasha/src/test/test_ucsc.c +++ b/code_c/Maasha/src/test/test_ucsc.c @@ -5,7 +5,12 @@ static void test_bed_entry_get(); static void test_bed_entries_get(); +static void test_bed_entries_destroy(); static void test_bed_entries_sort(); +static void test_bed_file_sort_beg(); +static void test_bed_file_sort_strand_beg(); +static void test_bed_file_sort_chr_beg(); +static void test_bed_file_sort_chr_strand_beg(); int main() @@ -14,7 +19,12 @@ int main() test_bed_entry_get(); test_bed_entries_get(); + test_bed_entries_destroy(); test_bed_entries_sort(); + test_bed_file_sort_beg(); + test_bed_file_sort_strand_beg(); + test_bed_file_sort_chr_beg(); + test_bed_file_sort_chr_strand_beg(); fprintf( stderr, "Done\n\n" ); @@ -52,7 +62,24 @@ void test_bed_entries_get() entries = bed_entries_get( path, 0 ); - bed_entries_put( entries, 0 ); +// bed_entries_put( entries, 0 ); + + fprintf( stderr, "OK\n" ); +} + + +void test_bed_entries_destroy() +{ + fprintf( stderr, " Testing bed_entries_destroy ... " ); + + char *path = "test/test_files/test12.bed"; + list_sl *entries = NULL; + + entries = bed_entries_get( path, 0 ); + + bed_entries_destroy( &entries ); + + assert( entries == NULL ); fprintf( stderr, "OK\n" ); } @@ -70,8 +97,57 @@ void test_bed_entries_sort() list_sl_sort( &entries, cmp_bed_sort_chr_beg ); list_sl_sort( &entries, cmp_bed_sort_chr_strand_beg ); - bed_entries_put( entries, 0 ); +// bed_entries_put( entries, 0 ); + + fprintf( stderr, "OK\n" ); +} + + +void test_bed_file_sort_beg() +{ + fprintf( stderr, " Testing bed_file_sort_beg ... " ); + +// char *path = "test/test_files/test12.bed"; + +// bed_file_sort_beg( path, 3 ); + + fprintf( stderr, "OK\n" ); +} + + +void test_bed_file_sort_strand_beg() +{ + fprintf( stderr, " Testing bed_file_sort_strand_beg ... " ); + +// char *path = "test/test_files/test12.bed"; + +// bed_file_sort_strand_beg( path, 6 ); fprintf( stderr, "OK\n" ); } + +void test_bed_file_sort_chr_beg() +{ + fprintf( stderr, " Testing bed_file_sort_chr_beg ... " ); + +// char *path = "test/test_files/test12.bed"; + +// bed_file_sort_chr_beg( path, 6 ); + + fprintf( stderr, "OK\n" ); +} + + +void test_bed_file_sort_chr_strand_beg() +{ + fprintf( stderr, " Testing bed_file_sort_chr_strand_beg ... " ); + +// char *path = "test/test_files/test12.bed"; + +// bed_file_sort_chr_strand_beg( path, 6 ); + + fprintf( stderr, "OK\n" ); +} + + diff --git a/code_perl/Maasha/C_code.pm b/code_perl/Maasha/C_code.pm new file mode 100644 index 0000000..0e17dad --- /dev/null +++ b/code_perl/Maasha/C_code.pm @@ -0,0 +1,56 @@ +package Maasha::C_code; + +# Copyright (C) 2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# This modules make all c functions visable to Perl. + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; + +use vars qw ( @ISA @EXPORT ); + +@ISA = qw( Exporter ); + +use Inline ( C => Config => + DIRECTORY => $ENV{ 'BP_TMP' }, +# PREFIX => "MAASHA_", + ENABLE => AUTOWRAP => + LIBS => "-L$ENV{ 'BP_C' }/Maasha/src/lib -lcommon", + INC => "-I$ENV{ 'BP_C' }/Maasha/src/inc", + ); + +use Inline C => << 'END_C'; + +void maasha_ping(); + +END_C + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +maasha_ping(); + +1; diff --git a/code_perl/Maasha/UCSC.pm b/code_perl/Maasha/UCSC.pm index 4f272fd..873c0a5 100644 --- a/code_perl/Maasha/UCSC.pm +++ b/code_perl/Maasha/UCSC.pm @@ -44,6 +44,19 @@ use constant { CHR_END => 2, INDEX_BEG => 3, INDEX_LEN => 4, + + CHR => 0, + CHR_BEG => 1, + CHR_END => 2, + Q_ID => 3, + SCORE => 4, + STRAND => 5, + THICK_BEG => 6, + THICK_END => 7, + ITEMRGB => 8, + BLOCKCOUNT => 9, + BLOCKSIZES => 10, + Q_BEGS => 11, }; @ISA = qw( Exporter ); @@ -57,6 +70,45 @@ my $TIME = gettimeofday(); # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED +sub bed_entry_get_array +{ + # Martin A. Hansen, September 2008. + + # Reads a BED entry given a filehandle. + + # This is a new _faster_ BED entry parser that + # uses arrays and not hashrefs. + + # IMPORTANT! This function does not correct the + # BED_END position that is kept the way UCSC + # does. + + my ( $fh, # file handle + $cols, # columns to read - OPTIONAL (3,4,5,6, or 12) + ) = @_; + + # Returns a list. + + my ( $line, @entry ); + + $line = <$fh>; + + $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr + + return if not defined $line; + + if ( not defined $cols ) { + $cols = 1 + $line =~ tr/\t//; + } + + @entry = split "\t", $line, $cols + 1; + + pop @entry if scalar @entry > $cols; + + return wantarray ? @entry : \@entry; +} + + sub bed_get_entry { # Martin A. Hansen, December 2007. @@ -176,6 +228,33 @@ sub bed_get_entries } +sub bed_entry_put_array +{ + # Martin A. Hansen, Septermber 2008. + + # Writes a BED entry array to file. + + # IMPORTANT! This function does not correct the + # BED_END position that is assumed to be in the + # UCSC positions scheme. + + my ( $record, # list + $fh, # file handle - OPTIONAL + $cols, # number of columns in BED file - OPTIONAL + ) = @_; + + # Returns nothing. + + $fh = \*STDOUT if not defined $fh; + + if ( defined $cols ) { + print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n"; + } else { + print $fh join( "\t", @{ $record } ), "\n"; + } +} + + sub bed_put_entry { # Martin A. Hansen, Septermber 2007. @@ -346,55 +425,66 @@ sub bed_analyze sub bed_sort { - # Martin A. Hansen, March 2008. + # Martin A. Hansen, Septermber 2008 - # Sort a potential huge BED file according to - # CHR, CHR_BEG and optionally STRAND. + # Sorts a BED file using the c program + # "bed_sort" specifing a sort mode: - my ( $tmp_dir, # temporary directory used for sorting - $file, # BED file to sort - $strand, # flag to sort on strand - OPTIONAL - ) = @_; + # 1: chr AND chr_beg. + # 2: chr AND strand AND chr_beg. + # 3: chr_beg. + # 4: strand AND chr_beg. - # Returns nothing. + my ( $bed_file, # BED file to sort + $sort_mode, # See above. + $cols, # Number of columns in BED file + ) = @_; - my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries ); + &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" ); +} - $fh_in = Maasha::Common::read_open( $file ); - while ( $entry = bed_get_entry( $fh_in ) ) - { - if ( $strand ) { - $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" }; - } else { - $key = $entry->{ "CHR" }; - } +sub bed_split_to_files +{ + # Martin A. Hansen, Septermber 2008 - $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key }; - - bed_put_entry( $entry, $fh_hash{ $key } ); - } + # Given a list of BED files, split these + # into temporary files based on the chromosome + # name. Returns a list of the temporary files. - close $fh_in; + my ( $bed_files, # list of BED files to split + $cols, # number of columns + $tmp_dir, # temporary directory + ) = @_; - map { close $_ } keys %fh_hash; + # Returns a list. - $fh_out = Maasha::Common::write_open( "$tmp_dir/temp.sort" ); + my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files ); - foreach $part_file ( sort keys %fh_hash ) + foreach $bed_file ( @{ $bed_files } ) { - $entries = bed_get_entries( "$tmp_dir/$part_file.sort" ); + $fh_in = Maasha::Common::read_open( $bed_file ); - @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries }; - - map { bed_put_entry( $_, $fh_out ) } @{ $entries }; + while ( $entry = bed_entry_get_array( $fh_in, $cols ) ) + { + $key = $entry->[ CHR ]; + + $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key }; + + bed_entry_put_array( $entry, $fh_hash{ $key } ); + } - unlink "$tmp_dir/$part_file.sort"; + close $fh_in; } - close $fh_out; + foreach $key ( sort keys %fh_hash ) + { + push @tmp_files, "$tmp_dir/$key.temp"; + + close $fh_hash{ $key }; + } - rename "$tmp_dir/temp.sort", $file; + return wantarray ? @tmp_files : \@tmp_files; }