fixed rename bug

[biopieces.git] / code_perl / Maasha / UCSC.pm
diff --git a/code_perl/Maasha/UCSC.pm b/code_perl/Maasha/UCSC.pm

index 1521742e09c741f8a17086bbab16bf4dc08fe319..01cb884c7903a3ea62ee8222a5349d97be1101c4 100644 (file)
--- a/code_perl/Maasha/UCSC.pm
+++ b/code_perl/Maasha/UCSC.pm
@@ -44,6 +44,19 @@ use constant {
      CHR_END      => 2,
      INDEX_BEG    => 3,
      INDEX_LEN    => 4,
+
+    CHR        => 0,
+    CHR_BEG    => 1,
+    CHR_END    => 2,
+    Q_ID       => 3,
+    SCORE      => 4,
+    STRAND     => 5,
+    THICK_BEG  => 6,
+    THICK_END  => 7,
+    ITEMRGB    => 8,
+    BLOCKCOUNT => 9,
+    BLOCKSIZES => 10,
+    Q_BEGS     => 11,
  };
  
  @ISA = qw( Exporter );
@@ -57,6 +70,45 @@ my $TIME = gettimeofday();
  # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
  
  
+sub bed_entry_get_array
+{
+    # Martin A. Hansen, September 2008.
+
+    # Reads a BED entry given a filehandle.
+
+    # This is a new _faster_ BED entry parser that
+    # uses arrays and not hashrefs.
+
+    # IMPORTANT! This function does not correct the
+    # BED_END position that is kept the way UCSC
+    # does.
+
+    my ( $fh,     # file handle
+         $cols,   # columns to read - OPTIONAL (3,4,5,6, or 12)
+       ) = @_;
+
+    # Returns a list.
+
+    my ( $line, @entry );
+
+    $line = <$fh>;
+
+    $line =~ tr/\n\r//d;    # some people have carriage returns in their BED files -> Grrrr
+
+    return if not defined $line;
+
+    if ( not defined $cols ) {
+        $cols = 1 + $line =~ tr/\t//;
+    }
+
+    @entry = split "\t", $line, $cols + 1;
+
+    pop @entry if scalar @entry > $cols;
+
+    return wantarray ? @entry : \@entry;
+}
+
+
  sub bed_get_entry
  {
      # Martin A. Hansen, December 2007.
@@ -176,6 +228,33 @@ sub bed_get_entries
  }
  
  
+sub bed_entry_put_array
+{
+    # Martin A. Hansen, Septermber 2008.
+
+    # Writes a BED entry array to file.
+
+    # IMPORTANT! This function does not correct the
+    # BED_END position that is assumed to be in the
+    # UCSC positions scheme.
+
+    my ( $record,   # list
+         $fh,       # file handle                   - OPTIONAL
+         $cols,     # number of columns in BED file - OPTIONAL
+       ) = @_;
+
+    # Returns nothing.
+
+    $fh = \*STDOUT if not defined $fh;
+
+    if ( defined $cols ) {
+        print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
+    } else {
+        print $fh join( "\t", @{ $record } ), "\n";
+    }
+}
+
+
  sub bed_put_entry
  {
      # Martin A. Hansen, Septermber 2007.
@@ -346,55 +425,66 @@ sub bed_analyze
  
  sub bed_sort
  {
-    # Martin A. Hansen, March 2008.
+    # Martin A. Hansen, Septermber 2008
  
-    # Sort a potential huge BED file according to
-    # CHR, CHR_BEG and optionally STRAND.
+    # Sorts a BED file using the c program
+    # "bed_sort" specifing a sort mode:
  
-    my ( $tmp_dir,   # temporary directory used for sorting
-         $file,      # BED file to sort
-         $strand,    # flag to sort on strand - OPTIONAL
-       ) = @_;
+    # 1: chr AND chr_beg.
+    # 2: chr AND strand AND chr_beg.
+    # 3: chr_beg.
+    # 4: strand AND chr_beg.
  
-    # Returns nothing.
+    my ( $bed_file,    # BED file to sort
+         $sort_mode,   # See above.
+         $cols,        # Number of columns in BED file
+       ) = @_;
  
-    my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries );
+    &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
+}
  
-    $fh_in = Maasha::Common::read_open( $file );
  
-    while ( $entry = bed_get_entry( $fh_in ) )
-    {
-        if ( $strand ) {
-            $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" };
-        } else {
-            $key = $entry->{ "CHR" };
-        }
+sub bed_split_to_files
+{
+    # Martin A. Hansen, Septermber 2008
  
-        $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key };
-        
-        bed_put_entry( $entry, $fh_hash{ $key } );
-    }
+    # Given a list of BED files, split these
+    # into temporary files based on the chromosome
+    # name. Returns a list of the temporary files.
  
-    close $fh_in;
+    my ( $bed_files,   # list of BED files to split
+         $cols,        # number of columns
+         $tmp_dir,     # temporary directory
+       ) = @_;
  
-    map { close $_ } keys %fh_hash;
+    # Returns a list.
  
-    $fh_out = Maasha::Common::write_open( "$tmp_dir/temp.sort" );
+    my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
  
-    foreach $part_file ( sort keys %fh_hash )
+    foreach $bed_file ( @{ $bed_files } )
      {
-        $entries = bed_get_entries( "$tmp_dir/$part_file.sort" );
+        $fh_in = Maasha::Common::read_open( $bed_file );
  
-        @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
-    
-        map { bed_put_entry( $_, $fh_out ) } @{ $entries };
+        while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
+        {
+            $key = $entry->[ CHR ];
+
+            $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
+            
+            bed_entry_put_array( $entry, $fh_hash{ $key } );
+        }
  
-        unlink "$tmp_dir/$part_file.sort";
+        close $fh_in;
      }
  
-    close $fh_out;
+    foreach $key ( sort keys %fh_hash )
+    {
+        push @tmp_files, "$tmp_dir/$key.temp";
+    
+        close $fh_hash{ $key };
+    }
  
-    rename "$tmp_dir/temp.sort", $file;
+    return wantarray ? @tmp_files : \@tmp_files;
  }
  
  
@@ -599,11 +689,68 @@ CREATE TABLE $table (
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  
  
+sub psl_get_entry
+{
+    # Martin A. Hansen, August 2008.
+
+    # Reads PSL next entry from a PSL file and returns a record.
+
+    my ( $fh,   # file handle of PSL filefull path to PSL file
+       ) = @_;
+
+    # Returns hashref.
+
+    my ( $line, @fields, %record );
+
+    while ( $line = <$fh> )
+    {
+        chomp $line;
+
+        @fields = split "\t", $line;
+
+        if ( scalar @fields == 21 )
+        {
+            %record = (
+                REC_TYPE    => "PSL",
+                MATCHES     => $fields[ 0 ],
+                MISMATCHES  => $fields[ 1 ],
+                REPMATCHES  => $fields[ 2 ],
+                NCOUNT      => $fields[ 3 ],
+                QNUMINSERT  => $fields[ 4 ],
+                QBASEINSERT => $fields[ 5 ],
+                SNUMINSERT  => $fields[ 6 ],
+                SBASEINSERT => $fields[ 7 ],
+                STRAND      => $fields[ 8 ],
+                Q_ID        => $fields[ 9 ],
+                Q_LEN       => $fields[ 10 ],
+                Q_BEG       => $fields[ 11 ],
+                Q_END       => $fields[ 12 ] - 1,
+                S_ID        => $fields[ 13 ],
+                S_LEN       => $fields[ 14 ],
+                S_BEG       => $fields[ 15 ],
+                S_END       => $fields[ 16 ] - 1,
+                BLOCKCOUNT  => $fields[ 17 ],
+                BLOCKSIZES  => $fields[ 18 ],
+                Q_BEGS      => $fields[ 19 ],
+                S_BEGS      => $fields[ 20 ],
+            );
+
+            $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
+            $record{ "SPAN" }  = $fields[ 12 ] - $fields[ 11 ];
+    
+            return wantarray ? %record : \%record;
+        }
+    }
+
+    return undef;
+}
+
+
  sub psl_get_entries
  {
      # Martin A. Hansen, February 2008.
  
-    # Reads PSL entries and returns a record.
+    # Reads PSL entries and returns a list of records.
  
      my ( $path,   # full path to PSL file
         ) = @_;
@@ -965,7 +1112,7 @@ sub fixedstep_index_retrieve
  }
  
  
-sub fixedStep_index_lookup
+sub fixedstep_index_lookup
  {
      # Martin A. Hansen, January 2008.