code_perl/Maasha/TwoBit.pm

   1 package Maasha::TwoBit;
   2
   3 # Copyright (C) 2008 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Stuff for interacting with the 2bit format as described here:
  26 # http://genome.ucsc.edu/FAQ/FAQformat#format7
  27
  28
  29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  30
  31
  32 use warnings;
  33 use warnings;
  34 use strict;
  35 use vars qw( @ISA @EXPORT );
  36
  37 use Data::Dumper;
  38
  39 use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "BP_TMP" } );
  40
  41 int find_block_beg( char *string, char c, int beg, int len )
  42 {
  43     /* Martin A. Hansen, March 2008 */
  44
  45     /* Given a string and a begin position, locates the next               */
  46     /* position in the string MATCHING a given char.                       */
  47     /* This position is returned. If the char is not found -1 is returned. */
  48
  49     int i;
  50
  51     for ( i = beg; i < len; i++ )
  52     {
  53         if ( string[ i ] == c ) {
  54             return i;
  55         }
  56     }
  57
  58     return -1;
  59 }
  60
  61
  62 int find_block_len( char *string, char c, int beg, int len )
  63 {
  64     /* Martin A. Hansen, March 2008 */
  65
  66     /* Given a string and a begin position, locates the next length of            */
  67     /* a block consisting of a given char. The length of that block is returned.  */
  68
  69     int i;
  70
  71     i = beg;
  72
  73     while ( i < len && string[ i ] == c )
  74     {
  75         i++;
  76     }
  77
  78     return i - beg;
  79 }
  80
  81
  82 char l2n[26] = { 2, 255, 1, 255, 255, 255, 3, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255 };
  83
  84 void dna2bin( char *raw, int size )
  85 {
  86     /* Khisanth from #perl March 2008 */
  87
  88     /* Encodes a DNA string to a bit array */
  89
  90     Inline_Stack_Vars;
  91     unsigned int i = 0;
  92     unsigned char packed_value = 0;
  93     char *packed = malloc( size / 4 );
  94     packed[0] = 0;
  95
  96     for( i = 0; i < size / 4; i++ ) {
  97         packed_value = l2n[ raw[i*4] - 'A' ] << 6
  98             | l2n[ raw[i*4+1] - 'A' ] << 4
  99             | l2n[ raw[i*4+2] - 'A' ] << 2
 100             | l2n[ raw[i*4+3] - 'A' ];
 101         packed[i] = packed_value;
 102     }
 103
 104     Inline_Stack_Reset;
 105     Inline_Stack_Push(sv_2mortal(newSVpvn(packed, size / 4 )));
 106     Inline_Stack_Done;
 107     free(packed);
 108 }
 109
 110
 111 char *conv[256] = {
 112     "TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT",
 113     "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC",
 114     "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA",
 115     "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG",
 116     "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT",
 117     "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC",
 118     "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA",
 119     "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG",
 120     "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT",
 121     "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC",
 122     "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA",
 123     "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG",
 124     "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT",
 125     "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC",
 126     "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA",
 127     "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG",
 128     "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT",
 129     "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC",
 130     "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA",
 131     "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG",
 132     "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT",
 133     "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC",
 134     "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA",
 135     "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG",
 136     "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT",
 137     "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC",
 138     "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA",
 139     "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG",
 140     "GGGT", "GGGC", "GGGA", "GGGG"
 141 };
 142
 143
 144 void bin2dna( char *raw, int size )
 145 {
 146     /* Khisanth from #perl, March 2008 */
 147
 148     /* Converts a bit array to DNA which is returned. */
 149
 150     Inline_Stack_Vars;
 151     char *unpacked = malloc( 4 * size + 1 );
 152
 153     int i = 0;
 154     unsigned char conv_index;
 155     unpacked[0] = 0;
 156
 157     for( i = 0; i < size; i++ ) {
 158         memset( &conv_index, raw[i], 1 );
 159         memcpy( unpacked + i*4, conv[conv_index], 4);
 160     }
 161
 162     Inline_Stack_Reset;
 163     Inline_Stack_Push(sv_2mortal(newSVpvn(unpacked, 4 * size)));
 164     Inline_Stack_Done;
 165     free(unpacked);
 166 }
 167
 168
 169 void bin2dna_old( char *bin, int bin_len )
 170 {
 171     /* Martin A. Hansen, March 2008 */
 172
 173     /* Converts a binary string to DNA which is returned. */
 174
 175     Inline_Stack_Vars;
 176
 177     int i, c;
 178
 179     char *dna = ( char* )( malloc( bin_len / 2 ) );
 180
 181     c = 0;
 182
 183     for ( i = 1; i < bin_len; i += 2 )
 184     {
 185         if ( bin[ i - 1 ] == '1' )
 186         {
 187             if ( bin[ i ] == '1' ) {
 188                 dna[ c ] = 'G';
 189             } else {
 190                 dna[ c ] = 'A';
 191             }
 192         }
 193         else
 194         {
 195             if ( bin[ i ] == '1' ) {
 196                 dna[ c ] = 'C';
 197             } else {
 198                 dna[ c ] = 'T';
 199             }
 200         }
 201
 202         c++;
 203     }
 204
 205     Inline_Stack_Reset;
 206     Inline_Stack_Push( sv_2mortal( newSVpvn( dna, ( bin_len / 2 ) ) ) );
 207     Inline_Stack_Done;
 208
 209     free( dna );
 210 }
 211
 212
 213 void hard_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
 214 {
 215     /* Martin A. Hansen, March 2008 */
 216
 217     /* Hard masks a sequnce in a given interval, which is trimmed, */
 218     /* if it does not match the sequence. */
 219
 220     int i, mask_beg, mask_len;
 221
 222     if ( sub_beg + sub_len >= beg && sub_beg <= beg + len )
 223     {
 224         mask_beg = beg - sub_beg;
 225
 226         if ( mask_beg < 0 ) {
 227             mask_beg = 0;
 228         }
 229
 230         mask_len = len;
 231
 232         if ( sub_len < mask_len ) {
 233             mask_len = sub_len;
 234         }
 235
 236         for ( i = mask_beg; i < mask_beg + mask_len; i++ ) {
 237             seq[ i ] = 'N';
 238         }
 239     }
 240 }
 241
 242
 243 void soft_mask( char *seq, int beg, int len, int sub_beg, int sub_len )
 244 {
 245     /* Martin A. Hansen, March 2008 */
 246
 247     /* Soft masks a sequnce in a given interval, which is trimmed, */
 248     /* if it does not match the sequence. */
 249
 250     int i, mask_beg, mask_len;
 251
 252     if ( sub_beg + sub_len >= beg && sub_beg <= beg + len )
 253     {
 254         mask_beg = beg - sub_beg;
 255
 256         if ( mask_beg < 0 ) {
 257             mask_beg = 0;
 258         }
 259
 260         mask_len = len;
 261
 262         if ( sub_len < mask_len ) {
 263             mask_len = sub_len;
 264         }
 265
 266         for ( i = mask_beg; i < mask_beg + mask_len; i++ ) {
 267             seq[ i ] = seq[ i ] ^ ' ';
 268         }
 269     }
 270 }
 271
 272 END_C
 273
 274
 275 use Maasha::Common;
 276 use Maasha::Fasta;
 277 use Maasha::Seq;
 278
 279 use constant {
 280     SEQ_NAME => 0,
 281     SEQ      => 1,
 282 };
 283
 284 @ISA = qw( Exporter );
 285
 286
 287 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 288
 289
 290 sub twobit_get_TOC
 291 {
 292     # Martin A. Hansen, March 2008.
 293
 294     # Fetches the table of contents (TOC) from a 2bit file.
 295     # The TOC is returned as a list of lists.
 296
 297     # The 2bit format is described here:
 298     # http://genome.ucsc.edu/FAQ/FAQformat#format7
 299
 300     my ( $fh,   # filehandle
 301        ) = @_;
 302
 303     # Returns AoA.
 304
 305     my ( $signature, $version, $seq_count, $reserved, $i, $seq_name_size, $string, $seq_name, $offset, @AoA );
 306
 307     sysseek $fh, 0, 0;
 308
 309     $signature = unpack_32bit( $fh );
 310     $version   = unpack_32bit( $fh );
 311     $seq_count = unpack_32bit( $fh );
 312     $reserved  = unpack_32bit( $fh );
 313
 314     Maasha::Common::error( qq(2bit file signature didn't match - inverse bit order?) ) if $signature != 0x1A412743;
 315
 316     for ( $i = 0; $i < $seq_count; $i++ )
 317     {
 318         $seq_name_size = unpack_8bit( $fh );
 319
 320         sysread $fh, $string, $seq_name_size;
 321
 322         $seq_name = unpack( "A$seq_name_size", $string );
 323
 324         $offset = unpack_32bit( $fh );
 325
 326         push @AoA, [ $seq_name, $offset ];
 327     }
 328
 329     return wantarray ? @AoA : \@AoA;
 330 }
 331
 332
 333 sub twobit_get_seq
 334 {
 335     # Martin A. Hansen, March 2008.
 336
 337     # Given a filehandle to a 2bit file, gets the sequence
 338     # or subsequence from an 2bit file entry at the given
 339     # offset position.
 340
 341     # The 2bit format is described here:
 342     # http://genome.ucsc.edu/FAQ/FAQformat#format7
 343
 344     my ( $fh,        # filehandle
 345          $offset,    # byte position
 346          $sub_beg,   # begin of subsequence                - OPTIONAL
 347          $sub_len,   # length of subsequence               - OPTIONAL
 348          $mask,      # retrieve soft mask information flag - OPTIONAL
 349        ) = @_;
 350
 351     # Returns a string.
 352
 353     my ( $string, $seq_len, $n_count, $n, @n_begs, @n_sizes, $m_count, $m, @m_begs, @m_sizes, $reserved, $seq );
 354
 355     $sub_beg ||= 0;
 356     $sub_len ||= 9999999999;
 357
 358     sysseek $fh, $offset, 0;
 359
 360     $seq_len = unpack_32bit( $fh );
 361     $sub_len = $seq_len if $sub_len > $seq_len;
 362
 363     $n_count = unpack_32bit( $fh );
 364
 365     map { push @n_begs,  unpack_32bit( $fh ) } 1 .. $n_count;
 366     map { push @n_sizes, unpack_32bit( $fh ) } 1 .. $n_count;
 367
 368     $m_count = unpack_32bit( $fh );
 369
 370     map { push @m_begs,  unpack_32bit( $fh ) } 1 .. $m_count;
 371     map { push @m_sizes, unpack_32bit( $fh ) } 1 .. $m_count;
 372
 373     $reserved = unpack_32bit( $fh );
 374
 375     $offset += 4 + 4 + $n_count * 8 + 4 + $m_count * 8 + 4;
 376
 377     $seq = unpack_dna( $fh, $offset, $sub_beg, $sub_len );
 378
 379     for ( $n = 0; $n < $n_count; $n++ )
 380     {
 381         hard_mask( $seq, $n_begs[ $n ], $n_sizes[ $n ], $sub_beg, $sub_len );
 382
 383         last if $sub_beg + $sub_len < $n_begs[ $n ];
 384     }
 385
 386     if ( $mask )
 387     {
 388         for ( $m = 0; $m < $m_count; $m++ )
 389         {
 390             soft_mask( $seq, $m_begs[ $m ], $m_sizes[ $m ], $sub_beg, $sub_len );
 391
 392             last if $sub_beg + $sub_len < $m_begs[ $m ];
 393         }
 394     }
 395
 396     return $seq;
 397 }
 398
 399
 400 sub unpack_8bit
 401 {
 402     # Martin A. Hansen, March 2008.
 403
 404     # Reads in 8 bits from the given filehandle
 405     # and returns the encoded value.
 406
 407     # NB swap still needs fixing.
 408
 409     my ( $fh,     # filehandle
 410          $swap,   # bit order swap flag  -  OPTIONAL
 411        ) = @_;
 412
 413     # Returns integer.
 414
 415     my ( $string, $val );
 416
 417     sysread $fh, $string, 1;
 418
 419     $val = unpack( "C", $string );
 420
 421     return $val;
 422 }
 423
 424
 425 sub unpack_32bit
 426 {
 427     # Martin A. Hansen, March 2008.
 428
 429     # Reads in 32 bits from the given filehandle
 430     # and returns the encoded value.
 431
 432     my ( $fh,     # filehandle
 433          $swap,   # bit order swap flag  -  OPTIONAL
 434        ) = @_;
 435
 436     # Returns integer.
 437
 438     my ( $string, $val );
 439
 440     sysread $fh, $string, 4;
 441
 442     if ( $swap ) {
 443         $val = unpack( "N", $string );
 444     } else {
 445         $val = unpack( "V", $string );
 446     }
 447
 448     return $val;
 449 }
 450
 451
 452 sub unpack_dna
 453 {
 454     # Martin A. Hansen, March 2008.
 455
 456     # Unpacks the DNA beginning at the given filehandle.
 457     # The DNA packed to two bits per base, where the first
 458     # base is in the most significant 2-bit byte; the last
 459     # base is in the least significant 2 bits. The packed
 460     # DNA field is padded with 0 bits as necessary to take
 461     # an even multiple of 32 bits in the file.
 462
 463     # NB swap still needs fixing.
 464
 465     my ( $fh,       # filehandle
 466          $offset,   # file offset
 467          $beg,      # sequence beg
 468          $len,      # sequence length
 469          $swap,     # bit order swap flag  -  OPTIONAL
 470        ) = @_;
 471
 472     # Returns a string.
 473
 474     my ( $bin, $bin_beg, $bin_len, $dna, $bin_diff, $len_diff );
 475
 476     $bin_beg = int( $beg / 4 );
 477     $bin_beg-- if $beg % 4;
 478     $bin_beg = 0 if $bin_beg < 0;
 479
 480     $bin_len = int( $len / 4 );
 481     $bin_len++ if $len % 4;
 482
 483     sysseek $fh, $offset + $bin_beg, 0;
 484     sysread $fh, $bin, $bin_len;
 485
 486     $dna = bin2dna( $bin, $bin_len );
 487
 488     $bin_diff = $beg - $bin_beg * 4;
 489     $len_diff = $bin_len * 4 - $len;
 490
 491     $dna =~ s/^.{$bin_diff}// if $bin_diff;
 492     $dna =~ s/.{$len_diff}$// if $len_diff;
 493
 494     return $dna;
 495 }
 496
 497
 498 sub fasta2twobit
 499 {
 500     # Martin A. Hansen, March 2008.
 501
 502     # Converts a FASTA file to 2bit format.
 503
 504     my ( $fh_in,    # file handle to FASTA file
 505          $fh_out,   # output file handle      -  OPTIONAL
 506          $mask,     # preserver soft masking  -  OPTIONAL
 507        ) = @_;
 508
 509     my ( $seq_offset, $offset, $entry, $mask_index, $seq_len, $seq_name_len, $pack_len, $rec_len, $index, $bin, $seq );
 510
 511     $fh_out = \*STDOUT if not $fh_out;
 512
 513     # ---- Creating content index ----
 514
 515     $seq_offset = 0;    # offset for reading sequence from FASTA file
 516     $offset     = 16;   # offset starting after header line which is 16 bytes
 517
 518     while ( $entry = Maasha::Fasta::get_entry( $fh_in ) )
 519     {
 520         $seq_len      = length $entry->[ SEQ ];
 521         $seq_name_len = length $entry->[ SEQ_NAME ];
 522
 523         $mask_index = mask_locate( $entry->[ SEQ ], $mask );
 524
 525         $pack_len = ( $seq_len + ( 4 - ( $seq_len ) % 4 ) ) / 4;
 526
 527         $rec_len = (
 528               4                                # Sequence length
 529             + 4                                # N blocks
 530             + 4 * $mask_index->{ "N_COUNT" }   # N begins
 531             + 4 * $mask_index->{ "N_COUNT" }   # N lengths
 532             + 4                                # M blocks
 533             + 4 * $mask_index->{ "M_COUNT" }   # M begins
 534             + 4 * $mask_index->{ "M_COUNT" }   # M lengths
 535             + 4                                # reserved
 536             + $pack_len                        # Packed DNA - 32 bit multiplum of 2 bit/base sequence in bytes
 537         );
 538
 539         push @{ $index }, {
 540             SEQ_NAME     => $entry->[ SEQ_NAME ],
 541             SEQ_NAME_LEN => $seq_name_len,
 542             SEQ_BEG      => $seq_offset + $seq_name_len + 2,
 543             SEQ_LEN      => $seq_len,
 544             N_COUNT      => $mask_index->{ "N_COUNT" },
 545             N_BEGS       => $mask_index->{ "N_BEGS" },
 546             N_LENS       => $mask_index->{ "N_LENS" },
 547             M_COUNT      => $mask_index->{ "M_COUNT" },
 548             M_BEGS       => $mask_index->{ "M_BEGS" },
 549             M_LENS       => $mask_index->{ "M_LENS" },
 550             REC_LEN      => $rec_len,
 551         };
 552
 553         $offset += (
 554             + 1              # 1 byte SEQ_NAME size
 555             + $seq_name_len  # SEQ_NAME depending on SEQ_NAME size
 556             + 4              # 32 bit offset position of sequence record
 557         );
 558
 559         $seq_offset += $seq_name_len + 2 + $seq_len + 1;
 560     }
 561
 562     # ---- Printing Header ----
 563
 564     $bin = pack( "V4", oct "0x1A412743", "0", scalar @{ $index }, 0 ); # signature, version, sequence count and reserved
 565
 566     print $fh_out $bin;
 567
 568     # ---- Printing TOC ----
 569
 570     undef $bin;
 571
 572     foreach $entry ( @{ $index } )
 573     {
 574         $bin .= pack( "C", $entry->{ "SEQ_NAME_LEN" } );                           # 1 byte SEQ_NAME size
 575         $bin .= pack( qq(A$entry->{ "SEQ_NAME_LEN" }), $entry->{ "SEQ_NAME" } );   # SEQ_NAME depending on SEQ_NAME size
 576         $bin .= pack( "V", $offset );                                              # 32 bit offset position of sequence record
 577
 578         $offset += $entry->{ "REC_LEN" };
 579     }
 580
 581     print $fh_out $bin;
 582
 583     # ---- Printing Records ----
 584
 585     foreach $entry ( @{ $index } )
 586     {
 587         undef $bin;
 588
 589         $bin .= pack( "V", $entry->{ "SEQ_LEN" } );
 590         $bin .= pack( "V", $entry->{ "N_COUNT" } );
 591
 592         map { $bin .= pack( "V", $_ ) } @{ $entry->{ "N_BEGS" } };
 593         map { $bin .= pack( "V", $_ ) } @{ $entry->{ "N_LENS" } };
 594
 595         $bin .= pack( "V", $entry->{ "M_COUNT" } );
 596
 597         map { $bin .= pack( "V", $_ ) } @{ $entry->{ "M_BEGS" } };
 598         map { $bin .= pack( "V", $_ ) } @{ $entry->{ "M_LENS" } };
 599
 600         $bin .= pack( "V", 0 );
 601
 602         sysseek $fh_in, $entry->{ "SEQ_BEG" }, 0;
 603         sysread $fh_in, $seq, $entry->{ "SEQ_LEN" };
 604
 605         $seq = uc $seq;
 606         $seq =~ tr/RYWSMKHDVBN/TTTTTTTTTTT/;
 607
 608         $bin .= pack_dna( $seq );
 609
 610         print $fh_out $bin;
 611     }
 612
 613     close $fh_in;
 614     close $fh_out;
 615 }
 616
 617
 618 sub pack_dna
 619 {
 620     # Martin A. Hansen, March 2008.
 621
 622     # Packs a DNA sequence into a bit array, The DNA packed to two bits per base,
 623     # represented as so: T - 00, C - 01, A - 10, G - 11. The first base is
 624     # in the most significant 2-bit byte; the last base is in the least significant
 625     # 2 bits. For example, the sequence TCAG is represented as 00011011.
 626     # The packedDna field is padded with 0 bits as necessary to take an even
 627     # multiple of 32 bits in the file.
 628
 629     my ( $dna,   # dna string to pack
 630        ) = @_;
 631
 632     # Returns bit array
 633
 634     my ( $bin );
 635
 636     $dna .= "T" x ( 4 - ( length( $dna ) % 4 ) );
 637
 638     $bin = dna2bin( $dna, length $dna );
 639
 640     return $bin;
 641 }
 642
 643
 644 sub mask_locate
 645 {
 646     # Martin A. Hansen, March 2008.
 647
 648     # Locate N-blocks and M-blocks in a given sequence.
 649     # These blocks a continously streches of Ns and Ms in a string,
 650     # and the begins and lenghts of these blocks are saved in a
 651     # hash along with the count of each block type.
 652
 653     my ( $seq,   # Sequence
 654          $mask,  # preserve soft masking flag  -  OPTIONAL
 655        ) = @_;
 656
 657     # Returns a hash.
 658
 659     my ( $n_mask, $m_mask, $seq_len, $pos, $n_beg, $n_len, $m_beg, $m_len, @n_begs, @n_lens, @m_begs, @m_lens, %mask_hash );
 660
 661     $seq =~ tr/atcgunRYWSMKHDVBrywsmkhdvb/MMMMMNNNNNNNNNNNNNNNNNNNNN/;
 662
 663     $n_mask = 1;   # always mask Ns.
 664     $m_mask = $mask || 0;
 665
 666     $seq_len = length $seq;
 667
 668     $pos = 0;
 669
 670     while ( $n_mask or $m_mask )
 671     {
 672         if ( $n_mask )
 673         {
 674             $n_beg = find_block_beg( $seq, "N", $pos, $seq_len );
 675
 676             $n_mask = 0 if $n_beg < 0;
 677         }
 678
 679         if ( $m_mask )
 680         {
 681             $m_beg = find_block_beg( $seq, "M", $pos, $seq_len );
 682
 683             $m_mask = 0 if $m_beg < 0;
 684         }
 685
 686         if ( $n_mask and $m_mask )
 687         {
 688             if ( $n_beg < $m_beg )
 689             {
 690                 $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
 691
 692                 push @n_begs, $n_beg;
 693                 push @n_lens, $n_len;
 694
 695                 $pos = $n_beg + $n_len;
 696             }
 697             else
 698             {
 699                 $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
 700
 701                 push @m_begs, $m_beg;
 702                 push @m_lens, $m_len;
 703
 704                 $pos = $m_beg + $m_len;
 705             }
 706         }
 707         elsif ( $n_mask )
 708         {
 709             $n_len = find_block_len( $seq, "N", $n_beg, $seq_len );
 710
 711             push @n_begs, $n_beg;
 712             push @n_lens, $n_len;
 713
 714             $pos = $n_beg + $n_len;
 715         }
 716         elsif ( $m_mask )
 717         {
 718             $m_len = find_block_len( $seq, "M", $m_beg, $seq_len );
 719
 720             push @m_begs, $m_beg;
 721             push @m_lens, $m_len;
 722
 723             $pos = $m_beg + $m_len;
 724         }
 725         else
 726         {
 727             last;
 728         }
 729     }
 730
 731     %mask_hash = (
 732         N_COUNT      => scalar @n_begs,
 733         N_BEGS       => [ @n_begs ],
 734         N_LENS       => [ @n_lens ],
 735         M_COUNT      => scalar @m_begs,
 736         M_BEGS       => [ @m_begs ],
 737         M_LENS       => [ @m_lens ],
 738     );
 739
 740     return wantarray ? %mask_hash : \%mask_hash;
 741 }
 742
 743
 744 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 745
 746
 747 1;