code_perl/Maasha/TwoBit.pm

   1 package Maasha::TwoBit;
   2
   3 # Copyright (C) 2008 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with the 2bit format as described here:
  26 # http://genome.ucsc.edu/FAQ/FAQformat#format7
  27
  28
  29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  30
  31
  32 use warnings;
  33 use strict;
  34 use vars qw( @ISA @EXPORT );
  35
  36 use Data::Dumper;
  37
  38 use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "TMP_DIR" } );
  39
  40 int find_block_beg( char *string, char c, int beg, int len )
  41 {
  42     /* Martin A. Hansen, March 2008 */
  43
  44     /* Given a string and a begin position, locates the next               */
  45     /* position in the string MATCHING a given char.                       */
  46     /* This position is returned. If the char is not found -1 is returned. */
  47
  48     int i;
  49
  50     for ( i = beg; i < len; i++ )
  51     {
  52         if ( string[ i ] == c ) {
  53             return i;
  54         }
  55     }
  56
  57     return -1;
  58 }
  59
  60
  61 int find_block_len( char *string, char c, int beg, int len )
  62 {
  63     /* Martin A. Hansen, March 2008 */
  64
  65     /* Given a string and a begin position, locates the next length of            */
  66     /* a block consisting of a given char. The length of that block is returned.  */
  67
  68     int i;
  69
  70     i = beg;
  71
  72     while ( i < len && string[ i ] == c )
  73     {
  74         i++;
  75     }
  76
  77     return i - beg;
  78 }
  79
  80
  81 char l2n[26] = { 2, 255, 1, 255, 255, 255, 3, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255 };
  82
  83 void dna2bin( char *raw, int size )
  84 {
  85     /* Khisanth from #perl March 2008 */
  86
  87     /* Encodes a DNA string to a bit array */
  88
  89     Inline_Stack_Vars;
  90     unsigned int i = 0;
  91     unsigned char packed_value = 0;
  92     char *packed = malloc( size / 4 );
  93     packed[0] = 0;
  94
  95     for( i = 0; i < size / 4; i++ ) {
  96         packed_value = l2n[ raw[i*4] - 'A' ] << 6
  97             | l2n[ raw[i*4+1] - 'A' ] << 4
  98             | l2n[ raw[i*4+2] - 'A' ] << 2
  99             | l2n[ raw[i*4+3] - 'A' ];
 100         packed[i] = packed_value;
 101     }
 102
 103     Inline_Stack_Reset;
 104     Inline_Stack_Push(sv_2mortal(newSVpvn(packed, size / 4 )));
 105     Inline_Stack_Done;
 106     free(packed);
 107 }
 108
 109
 110 char *conv[256] = {
 111     "TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT",
 112     "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC",
 113     "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA",
 114     "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG",
 115     "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT",
 116     "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC",
 117     "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA",
 118     "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG",
 119     "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT",
 120     "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC",
 121     "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA",
 122     "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG",
 123     "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT",
 124     "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC",
 125     "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA",
 126     "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG",
 127     "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT",
 128     "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC",
 129     "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA",
 130     "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG",
 131     "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT",
 132     "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC",
 133     "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA",
 134     "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG",
 135     "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT",
 136     "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC",
 137     "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA",
 138     "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG",
 139     "GGGT", "GGGC", "GGGA", "GGGG"
 140 };
 141
 142
 143 void bin2dna( char *raw, int size )
 144 {
 145     /* Khisanth from #perl, March 2008 */
 146
 147     /* Converts a bit array to DNA which is returned. */
 148
 149     Inline_Stack_Vars;
 150     char *unpacked = malloc( 4 * size + 1 );
 151
 152     int i = 0;
 153     unsigned char conv_index;
 154     unpacked[0] = 0;
 155
 156     for( i = 0; i < size; i++ ) {
 157         memset( &conv_index, raw[i], 1 );
 158         memcpy( unpacked + i*4, conv[conv_index], 4);
 159     }
 160
 161     Inline_Stack_Reset;
 162     Inline_Stack_Push(sv_2mortal(newSVpvn(unpacked, 4 * size)));
 163     Inline_Stack_Done;
 164     free(unpacked);
 165 }
 166
 167
 168 void bin2dna_old( char *bin, int bin_len )
 169 {
 170     /* Martin A. Hansen, March 2008 */
 171
 172     /* Converts a binary string to DNA which is returned. */
 173
 174     Inline_Stack_Vars;
 175
 176     int i, c;
 177
 178     char *dna = ( char* )( malloc( bin_len / 2 ) );
 179
 180     c = 0;
 181
 182     for ( i = 1; i < bin_len; i += 2 )
 183     {
 184         if ( bin[ i - 1 ] == '1' )
 185         {
 186             if ( bin[ i ] == '1' ) {
 187                 dna[ c ] = 'G';
 188             } else {
 189                 dna[ c ] = 'A';
 190             }
 191         }
 192         else
 193         {
 194             if ( bin[ i ] == '1' ) {
 195                 dna[ c ] = 'C';
 196             } else {
 197                 dna[ c ] = 'T';
 198             }
 199         }
 200
 201         c++;
 202     }
 203
 204     Inline_Stack_Reset;
 205     Inline_Stack_Push( sv_2mortal( newSVpvn( dna, ( bin_len / 2 ) ) ) );
 206     Inline_Stack_Done;
 207
 208     free( dna );
 209 }
 210
 211
 212 void hard_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
 213 {
 214     /* Martin A. Hansen, March 2008 */
 215
 216     /* Hard masks a sequnce in a given interval, which is trimmed, */
 217     /* if it does not match the sequence. */
 218
 219     int i, mask_beg, mask_len;
 220
 221     if ( sub_beg + sub_len >= beg && sub_beg <= beg + len )
 222     {
 223         mask_beg = beg - sub_beg;
 224
 225         if ( mask_beg < 0 ) {
 226             mask_beg = 0;
 227         }
 228
 229         mask_len = len;
 230
 231         if ( sub_len < mask_len ) {
 232             mask_len = sub_len;
 233         }
 234
 235         for ( i = mask_beg; i < mask_beg + mask_len; i++ ) {
 236             seq[ i ] = 'N';
 237         }
 238     }
 239 }
 240
 241
 242 void soft_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
 243 {
 244     /* Martin A. Hansen, March 2008 */
 245
 246     /* Soft masks a sequnce in a given interval, which is trimmed, */
 247     /* if it does not match the sequence. */
 248
 249     int i, mask_beg, mask_len;
 250
 251     if ( sub_beg + sub_len >= beg && sub_beg <= beg + len )
 252     {
 253         mask_beg = beg - sub_beg;
 254
 255         if ( mask_beg < 0 ) {
 256             mask_beg = 0;
 257         }
 258
 259         mask_len = len;
 260
 261         if ( sub_len < mask_len ) {
 262             mask_len = sub_len;
 263         }
 264
 265         for ( i = mask_beg; i < mask_beg + mask_len; i++ ) {
 266             seq[ i ] = seq[ i ] ^ ' ';
 267         }
 268     }
 269 }
 270
 271 END_C
 272
 273
 274 use Maasha::Common;
 275 use Maasha::Fasta;
 276 use Maasha::Seq;
 277
 278 use constant {
 279     SEQ_NAME => 0,
 280     SEQ      => 1,
 281 };
 282
 283 @ISA = qw( Exporter );
 284
 285
 286 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 287
 288
 289 sub twobit_get_TOC
 290 {
 291     # Martin A. Hansen, March 2008.
 292
 293     # Fetches the table of contents (TOC) from a 2bit file.
 294     # The TOC is returned as a list of lists.
 295
 296     # The 2bit format is described here:
 297     # http://genome.ucsc.edu/FAQ/FAQformat#format7
 298
 299     my ( $fh,   # filehandle
 300        ) = @_;
 301
 302     # Returns AoA.
 303
 304     my ( $signature, $version, $seq_count, $reserved, $i, $seq_name_size, $string, $seq_name, $offset, @AoA );
 305
 306     sysseek $fh, 0, 0;
 307
 308     $signature = &unpack_32bit( $fh );
 309     $version   = &unpack_32bit( $fh );
 310     $seq_count = &unpack_32bit( $fh );
 311     $reserved  = &unpack_32bit( $fh );
 312
 313     &Maasha::Common::error( qq(2bit file signature didn't match - inverse bit order?) ) if $signature != 0x1A412743;
 314
 315     for ( $i = 0; $i < $seq_count; $i++ )
 316     {
 317         $seq_name_size = &unpack_8bit( $fh );
 318
 319         sysread $fh, $string, $seq_name_size;
 320
 321         $seq_name = unpack( "A$seq_name_size", $string );
 322
 323         $offset = &unpack_32bit( $fh );
 324
 325         push @AoA, [ $seq_name, $offset ];
 326     }
 327
 328     return wantarray ? @AoA : \@AoA;
 329 }
 330
 331
 332 sub twobit_get_seq
 333 {
 334     # Martin A. Hansen, March 2008.
 335
 336     # Given a filehandle to a 2bit file, gets the sequence
 337     # or subsequence from an 2bit file entry at the given
 338     # offset position.
 339
 340     # The 2bit format is described here:
 341     # http://genome.ucsc.edu/FAQ/FAQformat#format7
 342
 343     my ( $fh,        # filehandle
 344          $offset,    # byte position
 345          $sub_beg,   # begin of subsequence                - OPTIONAL
 346          $sub_len,   # length of subsequence               - OPTIONAL
 347          $mask,      # retrieve soft mask information flag - OPTIONAL
 348        ) = @_;
 349
 350     # Returns a string.
 351
 352     my ( $string, $seq_len, $n_count, $n, @n_begs, @n_sizes, $m_count, $m, @m_begs, @m_sizes, $reserved, $seq );
 353
 354     $sub_beg ||= 0;
 355     $sub_len ||= 9999999999;
 356
 357     sysseek $fh, $offset, 0;
 358
 359     $seq_len = &unpack_32bit( $fh );
 360     $sub_len = $seq_len if $sub_len > $seq_len;
 361
 362     $n_count = &unpack_32bit( $fh );
 363
 364     map { push @n_begs,  &unpack_32bit( $fh ) } 1 .. $n_count;
 365     map { push @n_sizes, &unpack_32bit( $fh ) } 1 .. $n_count;
 366
 367     $m_count = &unpack_32bit( $fh );
 368
 369     map { push @m_begs,  &unpack_32bit( $fh ) } 1 .. $m_count;
 370     map { push @m_sizes, &unpack_32bit( $fh ) } 1 .. $m_count;
 371
 372     $reserved = &unpack_32bit( $fh );
 373
 374     $offset += 4 + 4 + $n_count * 8 + 4 + $m_count * 8 + 4;
 375
 376     $seq = &unpack_dna( $fh, $offset, $sub_beg, $sub_len );
 377
 378     for ( $n = 0; $n < $n_count; $n++ )
 379     {
 380         hard_mask( $seq, $n_begs[ $n ], $n_sizes[ $n ], $sub_beg, $sub_len );
 381
 382         last if $sub_beg + $sub_len < $n_begs[ $n ];
 383     }
 384
 385     if ( $mask )
 386     {
 387         for ( $m = 0; $m < $m_count; $m++ )
 388         {
 389             soft_mask( $seq, $m_begs[ $m ], $m_sizes[ $m ], $sub_beg, $sub_len );
 390
 391             last if $sub_beg + $sub_len < $m_begs[ $m ];
 392         }
 393     }
 394
 395     return $seq;
 396 }
 397
 398
 399 sub unpack_8bit
 400 {
 401     # Martin A. Hansen, March 2008.
 402
 403     # Reads in 8 bits from the given filehandle
 404     # and returns the encoded value.
 405
 406     # NB swap still needs fixing.
 407
 408     my ( $fh,     # filehandle
 409          $swap,   # bit order swap flag  -  OPTIONAL
 410        ) = @_;
 411
 412     # Returns integer.
 413
 414     my ( $string, $val );
 415
 416     sysread $fh, $string, 1;
 417
 418     $val = unpack( "C", $string );
 419
 420     return $val;
 421 }
 422
 423
 424 sub unpack_32bit
 425 {
 426     # Martin A. Hansen, March 2008.
 427
 428     # Reads in 32 bits from the given filehandle
 429     # and returns the encoded value.
 430
 431     my ( $fh,     # filehandle
 432          $swap,   # bit order swap flag  -  OPTIONAL
 433        ) = @_;
 434
 435     # Returns integer.
 436
 437     my ( $string, $val );
 438
 439     sysread $fh, $string, 4;
 440
 441     if ( $swap ) {
 442         $val = unpack( "N", $string );
 443     } else {
 444         $val = unpack( "V", $string );
 445     }
 446
 447     return $val;
 448 }
 449
 450
 451 sub unpack_dna
 452 {
 453     # Martin A. Hansen, March 2008.
 454
 455     # Unpacks the DNA beginning at the given filehandle.
 456     # The DNA packed to two bits per base, where the first
 457     # base is in the most significant 2-bit byte; the last
 458     # base is in the least significant 2 bits. The packed
 459     # DNA field is padded with 0 bits as necessary to take
 460     # an even multiple of 32 bits in the file.
 461
 462     # NB swap still needs fixing.
 463
 464     my ( $fh,       # filehandle
 465          $offset,   # file offset
 466          $beg,      # sequence beg
 467          $len,      # sequence length
 468          $swap,     # bit order swap flag  -  OPTIONAL
 469        ) = @_;
 470
 471     # Returns a string.
 472
 473     my ( $bin, $bin_beg, $bin_len, $dna, $bin_diff, $len_diff );
 474
 475     $bin_beg = int( $beg / 4 );
 476     $bin_beg-- if $beg % 4;
 477     $bin_beg = 0 if $bin_beg < 0;
 478
 479     $bin_len = int( $len / 4 );
 480     $bin_len++ if $len % 4;
 481
 482     sysseek $fh, $offset + $bin_beg, 0;
 483     sysread $fh, $bin, $bin_len;
 484
 485     $dna = bin2dna( $bin, $bin_len );
 486
 487     $bin_diff = $beg - $bin_beg * 4;
 488     $len_diff = $bin_len * 4 - $len;
 489
 490     $dna =~ s/^.{$bin_diff}// if $bin_diff;
 491     $dna =~ s/.{$len_diff}$// if $len_diff;
 492
 493     return $dna;
 494 }
 495
 496
 497 sub fasta2twobit
 498 {
 499     # Martin A. Hansen, March 2008.
 500
 501     # Converts a FASTA file to 2bit format.
 502
 503     my ( $fh_in,    # file handle to FASTA file
 504          $fh_out,   # output file handle      -  OPTIONAL
 505          $mask,     # preserver soft masking  -  OPTIONAL
 506        ) = @_;
 507
 508     my ( $seq_offset, $offset, $entry, $mask_index, $seq_len, $seq_name_len, $pack_len, $rec_len, $index, $bin, $seq );
 509
 510     $fh_out = \*STDOUT if not $fh_out;
 511
 512     # ---- Creating content index ----
 513
 514     $seq_offset = 0;    # offset for reading sequence from FASTA file
 515     $offset     = 16;   # offset starting after header line which is 16 bytes
 516
 517     while ( $entry = &Maasha::Fasta::get_entry( $fh_in ) )
 518     {
 519         $seq_len      = length $entry->[ SEQ ];
 520         $seq_name_len = length $entry->[ SEQ_NAME ];
 521
 522         $mask_index = &mask_locate( $entry->[ SEQ ], $mask );
 523
 524         $pack_len = ( $seq_len + ( 4 - ( $seq_len ) % 4 ) ) / 4;
 525
 526         $rec_len = (
 527               4                                # Sequence length
 528             + 4                                # N blocks
 529             + 4 * $mask_index->{ "N_COUNT" }   # N begins
 530             + 4 * $mask_index->{ "N_COUNT" }   # N lengths
 531             + 4                                # M blocks
 532             + 4 * $mask_index->{ "M_COUNT" }   # M begins
 533             + 4 * $mask_index->{ "M_COUNT" }   # M lengths
 534             + 4                                # reserved
 535             + $pack_len                        # Packed DNA - 32 bit multiplum of 2 bit/base sequence in bytes
 536         );
 537
 538         push @{ $index }, {
 539             SEQ_NAME     => $entry->[ SEQ_NAME ],
 540             SEQ_NAME_LEN => $seq_name_len,
 541             SEQ_BEG      => $seq_offset + $seq_name_len + 2,
 542             SEQ_LEN      => $seq_len,
 543             N_COUNT      => $mask_index->{ "N_COUNT" },
 544             N_BEGS       => $mask_index->{ "N_BEGS" },
 545             N_LENS       => $mask_index->{ "N_LENS" },
 546             M_COUNT      => $mask_index->{ "M_COUNT" },
 547             M_BEGS       => $mask_index->{ "M_BEGS" },
 548             M_LENS       => $mask_index->{ "M_LENS" },
 549             REC_LEN      => $rec_len,
 550         };
 551
 552         $offset += (
 553             + 1              # 1 byte SEQ_NAME size
 554             + $seq_name_len  # SEQ_NAME depending on SEQ_NAME size
 555             + 4              # 32 bit offset position of sequence record
 556         );
 557
 558         $seq_offset += $seq_name_len + 2 + $seq_len + 1;
 559     }
 560
 561     # ---- Printing Header ----
 562
 563     $bin = pack( "V4", oct "0x1A412743", "0", scalar @{ $index }, 0 ); # signature, version, sequence count and reserved
 564
 565     print $fh_out $bin;
 566
 567     # ---- Printing TOC ----
 568
 569     undef $bin;
 570
 571     foreach $entry ( @{ $index } )
 572     {
 573         $bin .= pack( "C", $entry->{ "SEQ_NAME_LEN" } );                           # 1 byte SEQ_NAME size
 574         $bin .= pack( qq(A$entry->{ "SEQ_NAME_LEN" }), $entry->{ "SEQ_NAME" } );   # SEQ_NAME depending on SEQ_NAME size
 575         $bin .= pack( "V", $offset );                                              # 32 bit offset position of sequence record
 576
 577         $offset += $entry->{ "REC_LEN" };
 578     }
 579
 580     print $fh_out $bin;
 581
 582     # ---- Printing Records ----
 583
 584     foreach $entry ( @{ $index } )
 585     {
 586         undef $bin;
 587
 588         $bin .= pack( "V", $entry->{ "SEQ_LEN" } );
 589         $bin .= pack( "V", $entry->{ "N_COUNT" } );
 590
 591         map { $bin .= pack( "V", $_ ) } @{ $entry->{ "N_BEGS" } };
 592         map { $bin .= pack( "V", $_ ) } @{ $entry->{ "N_LENS" } };
 593
 594         $bin .= pack( "V", $entry->{ "M_COUNT" } );
 595
 596         map { $bin .= pack( "V", $_ ) } @{ $entry->{ "M_BEGS" } };
 597         map { $bin .= pack( "V", $_ ) } @{ $entry->{ "M_LENS" } };
 598
 599         $bin .= pack( "V", 0 );
 600
 601         sysseek $fh_in, $entry->{ "SEQ_BEG" }, 0;
 602         sysread $fh_in, $seq, $entry->{ "SEQ_LEN" };
 603
 604         $seq = uc $seq;
 605         $seq =~ tr/RYWSMKHDVBN/TTTTTTTTTTT/;
 606
 607         $bin .= &pack_dna( $seq );
 608
 609         print $fh_out $bin;
 610     }
 611
 612     close $fh_in;
 613     close $fh_out;
 614 }
 615
 616
 617 sub pack_dna
 618 {
 619     # Martin A. Hansen, March 2008.
 620
 621     # Packs a DNA sequence into a bit array, The DNA packed to two bits per base,
 622     # represented as so: T - 00, C - 01, A - 10, G - 11. The first base is
 623     # in the most significant 2-bit byte; the last base is in the least significant
 624     # 2 bits. For example, the sequence TCAG is represented as 00011011.
 625     # The packedDna field is padded with 0 bits as necessary to take an even
 626     # multiple of 32 bits in the file.
 627
 628     my ( $dna,   # dna string to pack
 629        ) = @_;
 630
 631     # Returns bit array
 632
 633     my ( $bin );
 634
 635     $dna .= "T" x ( 4 - ( length( $dna ) % 4 ) );
 636
 637     $bin = dna2bin( $dna, length $dna );
 638
 639     return $bin;
 640 }
 641
 642
 643 sub mask_locate
 644 {
 645     # Martin A. Hansen, March 2008.
 646
 647     # Locate N-blocks and M-blocks in a given sequence.
 648     # These blocks a continously streches of Ns and Ms in a string,
 649     # and the begins and lenghts of these blocks are saved in a
 650     # hash along with the count of each block type.
 651
 652     my ( $seq,   # Sequence
 653          $mask,  # preserve soft masking flag  -  OPTIONAL
 654        ) = @_;
 655
 656     # Returns a hash.
 657
 658     my ( $n_mask, $m_mask, $seq_len, $pos, $n_beg, $n_len, $m_beg, $m_len, @n_begs, @n_lens, @m_begs, @m_lens, %mask_hash );
 659
 660     $seq =~ tr/atcgunRYWSMKHDVBrywsmkhdvb/MMMMMNNNNNNNNNNNNNNNNNNNNN/;
 661
 662     $n_mask = 1;   # always mask Ns.
 663     $m_mask = $mask || 0;
 664
 665     $seq_len = length $seq;
 666
 667     $pos = 0;
 668
 669     while ( $n_mask or $m_mask )
 670     {
 671         if ( $n_mask )
 672         {
 673             $n_beg = find_block_beg( $seq, "N", $pos, $seq_len );
 674
 675             $n_mask = 0 if $n_beg < 0;
 676         }
 677
 678         if ( $m_mask )
 679         {
 680             $m_beg = find_block_beg( $seq, "M", $pos, $seq_len );
 681
 682             $m_mask = 0 if $m_beg < 0;
 683         }
 684
 685         if ( $n_mask and $m_mask )
 686         {
 687             if ( $n_beg < $m_beg )
 688             {
 689                 $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
 690
 691                 push @n_begs, $n_beg;
 692                 push @n_lens, $n_len;
 693
 694                 $pos = $n_beg + $n_len;
 695             }
 696             else
 697             {
 698                 $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
 699
 700                 push @m_begs, $m_beg;
 701                 push @m_lens, $m_len;
 702
 703                 $pos = $m_beg + $m_len;
 704             }
 705         }
 706         elsif ( $n_mask )
 707         {
 708             $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
 709
 710             push @n_begs, $n_beg;
 711             push @n_lens, $n_len;
 712
 713             $pos = $n_beg + $n_len;
 714         }
 715         elsif ( $m_mask )
 716         {
 717             $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
 718
 719             push @m_begs, $m_beg;
 720             push @m_lens, $m_len;
 721
 722             $pos = $m_beg + $m_len;
 723         }
 724         else
 725         {
 726             last;
 727         }
 728     }
 729
 730     %mask_hash = (
 731         N_COUNT      => scalar @n_begs,
 732         N_BEGS       => [ @n_begs ],
 733         N_LENS       => [ @n_lens ],
 734         M_COUNT      => scalar @m_begs,
 735         M_BEGS       => [ @m_begs ],
 736         M_LENS       => [ @m_lens ],
 737     );
 738
 739     return wantarray ? %mask_hash : \%mask_hash;
 740 }
 741
 742
 743 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 744
 745
 746 1;