#include "list.h"
#include "ucsc.h"
+static void usage()
+{
+ fprintf( stderr,
+ "\n"
+ "bed_sort - sorts a BED file.\n"
+ "\n"
+ "Usage: bed_sort [options] <BED file>\n"
+ "\n"
+ "Options:\n"
+ " [-s <int> | --sort <int>] # 1: chr AND chr_beg.\n"
+ " # 2: chr AND strand AND chr_beg.\n"
+ " # 3: chr_beg.\n"
+ " # 4: strand AND chr_beg.\n"
+ " [-c <int> | --cols <int>] # Number of columns to read (default all).\n"
+ " [-d <dir> | --dir <dir> ] # Directory to use for file bound sorting.\n"
+ "\n"
+ "Examples:\n"
+ " bed_sort test.bed > test.bed.sort\n"
+ "\n"
+ );
+
+ exit( EXIT_FAILURE );
+}
+
+
+static struct option longopts[] = {
+ { "sort", required_argument, NULL, 's' },
+ { "cols", required_argument, NULL, 'c' },
+ { "dir", required_argument, NULL, 'd' },
+ { NULL, 0, NULL, 0 }
+};
+
int main( int argc, char *argv[] )
{
+ int opt = 0;
+ int sort = 1;
+ int cols = 0;
+ char *dir = NULL;
char *file = NULL;
list_sl *entries = NULL;
- file = argv[ 1 ];
+ while ( ( opt = getopt_long( argc, argv, "n:l", longopts, NULL ) ) != -1 )
+ {
+ switch ( opt ) {
+ case 's': sort = strtol( optarg, NULL, 0 ); break;
+ case 'c': cols = strtol( optarg, NULL, 0 ); break;
+ case 'd': dir = optarg; break;
+ default: break;
+ }
+ }
+
+ printf( "sort: %d cols: %d dir: %s\n", sort, cols, dir );
+
+ argc -= optind;
+ argv += optind;
+
+ if ( sort < 1 || sort > 4 )
+ {
+ fprintf( stderr, "ERROR: argument to --sort must be 1, 2, 3 or 4 - not: %d\n", sort );
+ abort();
+ }
+
+ if ( cols != 0 && cols != 3 && cols != 4 && cols != 5 && cols != 6 && cols != 12 )
+ {
+ fprintf( stderr, "ERROR: argument to --cols must be 3, 4, 5, 6 or 12 - not: %d\n", cols );
+ abort();
+ }
+
+ if ( ( sort == 2 || sort == 4 ) && ( cols > 0 && cols < 6 ) )
+ {
+ fprintf( stderr, "ERROR: cannot sort on strand with cols (%d) less than 6\n", cols );
+ abort();
+ }
+
+ if ( dir != NULL )
+ {
+ fprintf( stderr, "ERROR: directory: %s does not exists\n", dir );
+ abort();
+ }
+
+ if ( argc < 1 ) {
+ usage();
+ }
+
+ file = argv[ argc - 1 ];
- entries = bed_entries_get( file, 3 );
+ entries = bed_entries_get( file, cols );
- list_sl_sort( &entries, cmp_bed_sort_chr_beg );
+ switch ( sort )
+ {
+ case 1: list_sl_sort( &entries, cmp_bed_sort_chr_beg ); break;
+ case 2: list_sl_sort( &entries, cmp_bed_sort_chr_strand_beg ); break;
+ case 3: list_sl_sort( &entries, cmp_bed_sort_beg ); break;
+ case 4: list_sl_sort( &entries, cmp_bed_sort_strand_beg ); break;
+ default: break;
+ }
- bed_entries_put( entries, 3 );
+ bed_entries_put( entries, cols );
return EXIT_SUCCESS;
}
#include <math.h>
#include <assert.h>
#include <errno.h>
+#include <getopt.h>
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef long long llong;
-typedef char bool;
+typedef char boolean;
+#ifndef bool
+#define bool boolean
+#endif
+
+#ifndef TRUE
#define TRUE 1
+#endif
+
+#ifndef FALSE
#define FALSE 0
+#endif
/* Macros for determining min or max of two given values. */
#define MAX( a, b ) a < b ? b : a
/* Neat debug macro. */
#define DEBUG_EXIT 0
+
+#ifndef die
#define die assert( DEBUG_EXIT )
+#endif
/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MISC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/
+/* Function that prints "pong" to stderr. */
+void maasha_ping();
+
/* Return a binary number as a string of 1's and 0's. */
char *bits2string( uint bin );
#define BED_BLOCKSIZES_MAX 256
#define BED_QBEGS_MAX 256
+/* Structure of a BED entry with the 12 BED elements. */
+/* http://genome.ucsc.edu/FAQ/FAQformat#format1 */
struct _bed_entry
{
- int cols;
- char *chr;
- uint chr_beg;
- uint chr_end;
- char *q_id;
- int score;
- char strand;
- uint thick_beg;
- uint thick_end;
- char *itemrgb;
- uint blockcount;
- char *blocksizes;
- char *q_begs;
+ int cols; /* Number of BED elements used. */
+ char *chr; /* Chromosome name. */
+ uint chr_beg; /* Chromosome begin position. */
+ uint chr_end; /* Chromosome end position. */
+ char *q_id; /* Query ID. */
+ int score; /* Score. */
+ char strand; /* Strand. */
+ uint thick_beg; /* Begin position of thick drawing. */
+ uint thick_end; /* End position of thick drawing. */
+ char *itemrgb; /* RGB color (255,255,255).*/
+ uint blockcount; /* Number of blocks (exons). */
+ char *blocksizes; /* Comma separated string of blocks sizes. */
+ char *q_begs; /* Comma separated string of block begins. */
};
typedef struct _bed_entry bed_entry;
+/* Returns a new BED entry with memory allocated for */
+/* a given number of columns. */
bed_entry *bed_entry_new( const int cols );
+
+/* Free memory for a BED entry. */
+void bed_entry_destroy( bed_entry *entry );
+
+/* Get next BED entry of a given number of columns from a file pointer. */
bed_entry *bed_entry_get( FILE *fp, const int cols );
+
+/* Get a singly linked list with all BED entries (of a given number of coluns */
+/* from a specified file. */
list_sl *bed_entries_get( char *path, const int cols );
+
+/* Output a given number of columns from a BED entry to stdout. */
void bed_entry_put( bed_entry *entry, int cols );
+
+/* Output a given number of columns from all BED entries */
+/* in a singly linked list. */
void bed_entries_put( list_sl *entries, int cols );
+
+/* Free memory for all BED entries and list nodes. */
+void bed_entries_destroy( list_sl **entries_ppt );
+
+/* Given a path to a BED file, read the given number of cols */
+/* according to the begin position. The result is written to stdout. */
+void bed_file_sort_beg( char *path, int cols );
+
+/* Given a path to a BED file, read the given number of cols */
+/* according to the strand AND begin position. The result is written to stdout. */
+void bed_file_sort_strand_beg( char *path, int cols );
+
+/* Given a path to a BED file, read the given number of cols */
+/* according to the chromosome AND begin position. The result is written to stdout. */
+void bed_file_sort_chr_beg( char *path, int cols );
+
+/* Given a path to a BED file, read the given number of cols */
+/* according to the chromosome AND strand AND begin position. The result is written to stdout. */
+void bed_file_sort_chr_strand_beg( char *path, int cols );
+
+/* Compare function for sorting a singly linked list of BED entries */
+/* according to begin position. */
int cmp_bed_sort_beg( const void *a, const void *b );
+
+/* Compare function for sorting a singly linked list of BED entries */
+/* according to strand AND begin position. */
+int cmp_bed_sort_strand_beg( const void *a, const void *b );
+
+/* Compare function for sorting a singly linked list of BED entries */
+/* according to chromosome name AND begin position. */
int cmp_bed_sort_chr_beg( const void *a, const void *b );
+
+/* Compare function for sorting a singly linked list of BED entries */
+/* according to chromosome name AND strand AND begin position. */
int cmp_bed_sort_chr_strand_beg( const void *a, const void *b );
+
+
/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MISC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/
+void maasha_ping()
+{
+ /* Martin A. Hansen, Septermber 2008 */
+
+ /* Function that prints "pong" to stderr. */
+
+ fprintf( stderr, "pong\n" );
+}
+
+
char *bits2string( uint bin )
{
/* Martin A. Hansen, June 2008 */
bed_entry *bed_entry_new( const int cols )
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Returns a new BED entry with memory allocated for */
+ /* a given number of columns. */
+
bed_entry *entry = mem_get( sizeof( bed_entry ) );
entry->cols = cols;
entry->thick_beg = 0;
entry->thick_end = 0;
entry->itemrgb = mem_get( BED_ITEMRGB_MAX );
- entry->blockcount = 0;;
+ entry->blockcount = 0;
entry->blocksizes = mem_get( BED_BLOCKSIZES_MAX );
entry->q_begs = mem_get( BED_QBEGS_MAX );
}
+void bed_entry_destroy( bed_entry *entry )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Free memory for a BED entry. */
+
+ int cols = entry->cols;
+
+ if ( cols > 6 )
+ {
+ free( entry->itemrgb );
+ free( entry->blocksizes );
+ free( entry->q_begs );
+ free( entry->q_id );
+ free( entry->chr );
+ }
+ else if ( cols > 3 )
+ {
+ free( entry->q_id );
+ free( entry->chr );
+ }
+ else
+ {
+ free( entry->chr );
+ }
+
+ mem_free( &entry );
+}
+
+
bed_entry *bed_entry_get( FILE *fp, int cols )
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Get next BED entry of a given number of columns from a file pointer. */
+
bed_entry *entry = bed_entry_new( cols );
char buffer[ BED_BUFFER ];
list_sl *bed_entries_get( char *path, const int cols )
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Get a singly linked list with all BED entries (of a given number of coluns */
+ /* from a specified file. */
+
list_sl *list = list_sl_new();
node_sl *node = node_sl_new();
node_sl *old_node = NULL;
void bed_entry_put( bed_entry *entry, int cols )
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Output a given number of columns from a BED entry to stdout. */
+
if ( ! cols ) {
cols = entry->cols;
}
void bed_entries_put( list_sl *entries, int cols )
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Output a given number of columns from all BED entries */
+ /* in a singly linked list. */
+
node_sl *node = NULL;
for ( node = entries->first; node != NULL; node = node->next ) {
}
+void bed_entries_destroy( list_sl **entries_ppt )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Free memory for all BED entries and list nodes. */
+
+ list_sl *entries = *entries_ppt;
+ node_sl *node = NULL;
+ node_sl *next = NULL;
+
+ next = entries->first;
+
+ while ( next != NULL )
+ {
+ node = next;
+
+ bed_entry_destroy( ( bed_entry * ) node->val );
+
+ next = node->next;
+
+ mem_free( &node );
+ }
+
+ mem_free( &entries );
+
+ *entries_ppt = NULL;
+}
+
+
+void bed_file_sort_beg( char *path, int cols )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Given a path to a BED file, read the given number of cols */
+ /* according to the begin position. The result is written to stdout. */
+
+ list_sl *entries = NULL;
+
+ entries = bed_entries_get( path, cols );
+
+ list_sl_sort( &entries, cmp_bed_sort_beg );
+
+ bed_entries_put( entries, cols );
+
+ bed_entries_destroy( &entries );
+}
+
+
+void bed_file_sort_strand_beg( char *path, int cols )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Given a path to a BED file, read the given number of cols */
+ /* according to the strand AND begin position. The result is written to stdout. */
+
+ assert( cols >= 6 );
+
+ list_sl *entries = NULL;
+
+ entries = bed_entries_get( path, cols );
+
+ list_sl_sort( &entries, cmp_bed_sort_strand_beg );
+
+ bed_entries_put( entries, cols );
+
+ bed_entries_destroy( &entries );
+}
+
+
+void bed_file_sort_chr_beg( char *path, int cols )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Given a path to a BED file, read the given number of cols */
+ /* according to the chromosome AND begin position. The result is written to stdout. */
+
+ list_sl *entries = NULL;
+
+ entries = bed_entries_get( path, cols );
+
+ list_sl_sort( &entries, cmp_bed_sort_chr_beg );
+
+ bed_entries_put( entries, cols );
+
+ bed_entries_destroy( &entries );
+}
+
+
+void bed_file_sort_chr_strand_beg( char *path, int cols )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Given a path to a BED file, read the given number of cols */
+ /* according to the chromosome AND strand AND begin position. The result is written to stdout. */
+
+ assert( cols >= 6 );
+
+ list_sl *entries = NULL;
+
+ entries = bed_entries_get( path, cols );
+
+ list_sl_sort( &entries, cmp_bed_sort_chr_strand_beg );
+
+ bed_entries_put( entries, cols );
+
+ bed_entries_destroy( &entries );
+}
+
+
int cmp_bed_sort_beg( const void *a, const void *b )
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Compare function for sorting a singly linked list of BED entries */
+ /* according to begin position. */
+
node_sl *a_node = *( ( node_sl ** ) a );
node_sl *b_node = *( ( node_sl ** ) b );
}
+int cmp_bed_sort_strand_beg( const void *a, const void *b )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Compare function for sorting a singly linked list of BED entries */
+ /* according to strand AND begin position. */
+
+ node_sl *a_node = *( ( node_sl ** ) a );
+ node_sl *b_node = *( ( node_sl ** ) b );
+
+ bed_entry *a_entry = ( bed_entry * ) a_node->val;
+ bed_entry *b_entry = ( bed_entry * ) b_node->val;
+
+ if ( a_entry->strand < b_entry->strand ) {
+ return -1;
+ } else if ( a_entry->strand > b_entry->strand ) {
+ return 1;
+ } else if ( a_entry->chr_beg < b_entry->chr_beg ) {
+ return -1;
+ } else if ( a_entry->chr_beg > b_entry->chr_beg ) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+
int cmp_bed_sort_chr_beg( const void *a, const void *b )
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Compare function for sorting a singly linked list of BED entries */
+ /* according to chromosome name AND begin position. */
+
node_sl *a_node = *( ( node_sl ** ) a );
node_sl *b_node = *( ( node_sl ** ) b );
int cmp_bed_sort_chr_strand_beg( const void *a, const void *b )
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Compare function for sorting a singly linked list of BED entries */
+ /* according to chromosome name AND strand AND begin position. */
+
node_sl *a_node = *( ( node_sl ** ) a );
node_sl *b_node = *( ( node_sl ** ) b );
/* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
-#include <getopt.h>
#include "common.h"
#include "mem.h"
#include "filesys.h"
static void test_bed_entry_get();
static void test_bed_entries_get();
+static void test_bed_entries_destroy();
static void test_bed_entries_sort();
+static void test_bed_file_sort_beg();
+static void test_bed_file_sort_strand_beg();
+static void test_bed_file_sort_chr_beg();
+static void test_bed_file_sort_chr_strand_beg();
int main()
test_bed_entry_get();
test_bed_entries_get();
+ test_bed_entries_destroy();
test_bed_entries_sort();
+ test_bed_file_sort_beg();
+ test_bed_file_sort_strand_beg();
+ test_bed_file_sort_chr_beg();
+ test_bed_file_sort_chr_strand_beg();
fprintf( stderr, "Done\n\n" );
entries = bed_entries_get( path, 0 );
- bed_entries_put( entries, 0 );
+// bed_entries_put( entries, 0 );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
+void test_bed_entries_destroy()
+{
+ fprintf( stderr, " Testing bed_entries_destroy ... " );
+
+ char *path = "test/test_files/test12.bed";
+ list_sl *entries = NULL;
+
+ entries = bed_entries_get( path, 0 );
+
+ bed_entries_destroy( &entries );
+
+ assert( entries == NULL );
fprintf( stderr, "OK\n" );
}
list_sl_sort( &entries, cmp_bed_sort_chr_beg );
list_sl_sort( &entries, cmp_bed_sort_chr_strand_beg );
- bed_entries_put( entries, 0 );
+// bed_entries_put( entries, 0 );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
+void test_bed_file_sort_beg()
+{
+ fprintf( stderr, " Testing bed_file_sort_beg ... " );
+
+// char *path = "test/test_files/test12.bed";
+
+// bed_file_sort_beg( path, 3 );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
+void test_bed_file_sort_strand_beg()
+{
+ fprintf( stderr, " Testing bed_file_sort_strand_beg ... " );
+
+// char *path = "test/test_files/test12.bed";
+
+// bed_file_sort_strand_beg( path, 6 );
fprintf( stderr, "OK\n" );
}
+
+void test_bed_file_sort_chr_beg()
+{
+ fprintf( stderr, " Testing bed_file_sort_chr_beg ... " );
+
+// char *path = "test/test_files/test12.bed";
+
+// bed_file_sort_chr_beg( path, 6 );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
+void test_bed_file_sort_chr_strand_beg()
+{
+ fprintf( stderr, " Testing bed_file_sort_chr_strand_beg ... " );
+
+// char *path = "test/test_files/test12.bed";
+
+// bed_file_sort_chr_strand_beg( path, 6 );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
--- /dev/null
+package Maasha::C_code;
+
+# Copyright (C) 2008 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# This modules make all c functions visable to Perl.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use strict;
+
+use vars qw ( @ISA @EXPORT );
+
+@ISA = qw( Exporter );
+
+use Inline ( C => Config =>
+ DIRECTORY => $ENV{ 'BP_TMP' },
+# PREFIX => "MAASHA_",
+ ENABLE => AUTOWRAP =>
+ LIBS => "-L$ENV{ 'BP_C' }/Maasha/src/lib -lcommon",
+ INC => "-I$ENV{ 'BP_C' }/Maasha/src/inc",
+ );
+
+use Inline C => << 'END_C';
+
+void maasha_ping();
+
+END_C
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+maasha_ping();
+
+1;
CHR_END => 2,
INDEX_BEG => 3,
INDEX_LEN => 4,
+
+ CHR => 0,
+ CHR_BEG => 1,
+ CHR_END => 2,
+ Q_ID => 3,
+ SCORE => 4,
+ STRAND => 5,
+ THICK_BEG => 6,
+ THICK_END => 7,
+ ITEMRGB => 8,
+ BLOCKCOUNT => 9,
+ BLOCKSIZES => 10,
+ Q_BEGS => 11,
};
@ISA = qw( Exporter );
# http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
+sub bed_entry_get_array
+{
+ # Martin A. Hansen, September 2008.
+
+ # Reads a BED entry given a filehandle.
+
+ # This is a new _faster_ BED entry parser that
+ # uses arrays and not hashrefs.
+
+ # IMPORTANT! This function does not correct the
+ # BED_END position that is kept the way UCSC
+ # does.
+
+ my ( $fh, # file handle
+ $cols, # columns to read - OPTIONAL (3,4,5,6, or 12)
+ ) = @_;
+
+ # Returns a list.
+
+ my ( $line, @entry );
+
+ $line = <$fh>;
+
+ $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
+
+ return if not defined $line;
+
+ if ( not defined $cols ) {
+ $cols = 1 + $line =~ tr/\t//;
+ }
+
+ @entry = split "\t", $line, $cols + 1;
+
+ pop @entry if scalar @entry > $cols;
+
+ return wantarray ? @entry : \@entry;
+}
+
+
sub bed_get_entry
{
# Martin A. Hansen, December 2007.
}
+sub bed_entry_put_array
+{
+ # Martin A. Hansen, Septermber 2008.
+
+ # Writes a BED entry array to file.
+
+ # IMPORTANT! This function does not correct the
+ # BED_END position that is assumed to be in the
+ # UCSC positions scheme.
+
+ my ( $record, # list
+ $fh, # file handle - OPTIONAL
+ $cols, # number of columns in BED file - OPTIONAL
+ ) = @_;
+
+ # Returns nothing.
+
+ $fh = \*STDOUT if not defined $fh;
+
+ if ( defined $cols ) {
+ print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
+ } else {
+ print $fh join( "\t", @{ $record } ), "\n";
+ }
+}
+
+
sub bed_put_entry
{
# Martin A. Hansen, Septermber 2007.
sub bed_sort
{
- # Martin A. Hansen, March 2008.
+ # Martin A. Hansen, Septermber 2008
- # Sort a potential huge BED file according to
- # CHR, CHR_BEG and optionally STRAND.
+ # Sorts a BED file using the c program
+ # "bed_sort" specifing a sort mode:
- my ( $tmp_dir, # temporary directory used for sorting
- $file, # BED file to sort
- $strand, # flag to sort on strand - OPTIONAL
- ) = @_;
+ # 1: chr AND chr_beg.
+ # 2: chr AND strand AND chr_beg.
+ # 3: chr_beg.
+ # 4: strand AND chr_beg.
- # Returns nothing.
+ my ( $bed_file, # BED file to sort
+ $sort_mode, # See above.
+ $cols, # Number of columns in BED file
+ ) = @_;
- my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries );
+ &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
+}
- $fh_in = Maasha::Common::read_open( $file );
- while ( $entry = bed_get_entry( $fh_in ) )
- {
- if ( $strand ) {
- $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" };
- } else {
- $key = $entry->{ "CHR" };
- }
+sub bed_split_to_files
+{
+ # Martin A. Hansen, Septermber 2008
- $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key };
-
- bed_put_entry( $entry, $fh_hash{ $key } );
- }
+ # Given a list of BED files, split these
+ # into temporary files based on the chromosome
+ # name. Returns a list of the temporary files.
- close $fh_in;
+ my ( $bed_files, # list of BED files to split
+ $cols, # number of columns
+ $tmp_dir, # temporary directory
+ ) = @_;
- map { close $_ } keys %fh_hash;
+ # Returns a list.
- $fh_out = Maasha::Common::write_open( "$tmp_dir/temp.sort" );
+ my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
- foreach $part_file ( sort keys %fh_hash )
+ foreach $bed_file ( @{ $bed_files } )
{
- $entries = bed_get_entries( "$tmp_dir/$part_file.sort" );
+ $fh_in = Maasha::Common::read_open( $bed_file );
- @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
-
- map { bed_put_entry( $_, $fh_out ) } @{ $entries };
+ while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
+ {
+ $key = $entry->[ CHR ];
+
+ $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
+
+ bed_entry_put_array( $entry, $fh_hash{ $key } );
+ }
- unlink "$tmp_dir/$part_file.sort";
+ close $fh_in;
}
- close $fh_out;
+ foreach $key ( sort keys %fh_hash )
+ {
+ push @tmp_files, "$tmp_dir/$key.temp";
+
+ close $fh_hash{ $key };
+ }
- rename "$tmp_dir/temp.sort", $file;
+ return wantarray ? @tmp_files : \@tmp_files;
}