examples/perlmask.pl

   1 #!/usr/bin/perl -w
   2 use strict;
   3
   4 # Walk through a perl script and create a masked file which is
   5 # similar but which masks comments, quotes, patterns, and non-code
   6 # lines so that it is easy to parse with regular expressions.
   7 #
   8 # usage:
   9 # perlmask [-cn]  myfile.pl >myfile.new
  10 # perlmask [-cn] <myfile.pl >myfile.new
  11 #
  12 # In the masked file,
  13 #  -comments and pod will be masked (or removed)
  14 #  -here-doc text lines will be masked (or removed)
  15 #  -quotes and patterns, qw quotes, and here doc << operators will be
  16 #   replaced by the letters 'Q', 'q', or 'h'
  17 #
  18 # The result is a file in which all braces, parens, and square brackets
  19 # are balanced, and it can be parsed relatively easily by regular
  20 # expressions.
  21 #
  22 # -cn is an optional 'compression' flag.  By default the masked file will have
  23 # the same number of characters as the input file, with the difference being
  24 # that certain characters will be changed (masked).
  25 #
  26 # If character position correspondence is not required, the size of the masked
  27 # file can be significantly reduced by increasing the 'compression' level as
  28 # follows:
  29 #
  30 # -c0 all mask file line numbers and character positions agree with
  31 #     original file (DEFAULT)
  32 # -c1 line numbers agree and character positions agree within lines of code
  33 # -c2 line numbers agree but character positions do not
  34 # -c3 no correspondence between line numbers or character positions
  35 #
  36 # Try each of these on a file of significant size to see how they work.
  37 # The default, -c0, is required if you are working with character positions
  38 # that span multiple lines.  The other levels may be useful if you
  39 # do not need this level of correspondence.
  40 #
  41 # This file is one of the examples distributed with perltidy and demonstrates
  42 # using a callback object with Perl::Tidy to walk through a perl file and find
  43 # all of its tokens.  It can be useful for simple perl code parsing tasks.  It
  44 # might even be helpful in debugging.  Or you may want to modify it to suit
  45 # your own purposes.
  46 #
  47 use Getopt::Std;
  48 use IO::File;
  49 $| = 1;
  50 use vars qw($opt_c $opt_h);
  51 my $usage = <<EOM;
  52    usage: perlmask [ -cn ] filename >outfile
  53 EOM
  54 getopts('c:h') or die "$usage";
  55 if ($opt_h) { die $usage }
  56 unless ( defined($opt_c) ) { $opt_c = 0 }
  57 if (@ARGV > 1) { die $usage }
  58
  59 my $source=$ARGV[0];   # an undefined filename will become stdin
  60
  61 # strings to hold the files (arrays could be used to)
  62 my ( $masked_file, $original_file );
  63
  64 PerlMask::perlmask(
  65     _source         => $source,
  66     _rmasked_file   => \$masked_file,
  67     _roriginal_file => \$original_file,    # optional
  68     _compression    => $opt_c              # optional, default=0
  69 );
  70
  71 # Now we have the masked and original files in strings of equal length.
  72 # We could search for specific text in the masked file here.  But here
  73 # we'll just print the masked file:
  74 if ($masked_file) { print $masked_file; }
  75
  76 #####################################################################
  77 #
  78 # The PerlMask package is an interface to perltidy which accepts a
  79 # source filehandle and returns a 'masked' version of the source as
  80 # a string or array.  It can also optionally return the original file
  81 # as a string or array.
  82 #
  83 # It works by making a callback object with a write_line() method to
  84 # receive tokenized lines from perltidy.  This write_line method
  85 # selectively replaces tokens with either their original text or with a
  86 # benign masking character (such as '#' or 'Q').
  87 #
  88 # Usage:
  89 #
  90 #   PerlMask::perlmask(
  91 #       _source         => $fh,             # required source
  92 #       _rmasked_file   => \$masked_file,   # required ref to ARRAY or SCALAR
  93 #       _roriginal_file => \$original_file, # optional ref to ARRAY or SCALAR
  94 #       _compression    => $opt_c           # optional
  95 #   );
  96 #
  97 # _source is any source that perltidy will accept, including a
  98 # filehandle or reference to SCALAR or ARRAY
  99 #
 100 # The compression flag may have these values:
 101 #  0 all mask file line numbers and character positions agree with
 102 #    original file (DEFAULT)
 103 #  1 line numbers agree and character positions agree within lines of code
 104 #  2 line numbers agree but character positions do not
 105 #  3 no correspondence between line numbers or character positions
 106 #
 107 #####################################################################
 108
 109 package PerlMask;
 110 use Carp;
 111 use Perl::Tidy;
 112
 113 sub perlmask {
 114
 115     my %args = ( _compression => 0, @_ );
 116     my $rfile = $args{_rmasked_file};
 117     unless ( defined($rfile) ) {
 118         croak
 119           "Missing required parameter '_rmasked_file' in call to perlmask\n";
 120     }
 121     my $ref=ref($rfile);
 122     unless ( $ref =~ /^(SCALAR|ARRAY)$/ ) {
 123             croak <<EOM;
 124 Expecting _rmasked_file = ref to SCALAR or ARRAY in perlmask but got : ($ref)
 125 EOM
 126     }
 127
 128     # run perltidy, which will call $formatter's write_line() for each line
 129     my $err=perltidy(
 130         'source'    => $args{_source},
 131         'formatter' => bless( \%args, __PACKAGE__ ),    # callback object
 132         'argv'        => "-npro -se",    # -npro : ignore .perltidyrc,
 133                                          # -se   : errors to STDOUT
 134     );
 135     if ($err) {
 136         die "Error calling perltidy\n";
 137     }
 138 }
 139
 140 sub print_line {
 141
 142     # called from write_line to dispatch one line (either masked or original)..
 143     # here we'll either append it to a string or array, as appropriate
 144     my ( $rfile, $line ) = @_;
 145     if ( defined($rfile) ) {
 146         if ( ref($rfile) eq 'SCALAR' ) {
 147             $$rfile .= $line . "\n";
 148         }
 149         elsif ( ref($rfile) eq 'ARRAY' ) {
 150             push @{$rfile}, $line . "\n";
 151         }
 152     }
 153 }
 154
 155 sub write_line {
 156
 157     # This is called from perltidy line-by-line
 158     my ( $self, $line_of_tokens ) = @_;
 159     my $rmasked_file   = $self->{_rmasked_file};
 160     my $roriginal_file = $self->{_roriginal_file};
 161     my $opt_c          = $self->{_compression};
 162
 163     my $line_type         = $line_of_tokens->{_line_type};
 164     my $input_line_number = $line_of_tokens->{_line_number};
 165     my $input_line        = $line_of_tokens->{_line_text};
 166     my $rtoken_type       = $line_of_tokens->{_rtoken_type};
 167     my $rtokens           = $line_of_tokens->{_rtokens};
 168     chomp $input_line;
 169
 170     # mask non-CODE lines
 171     if ( $line_type ne 'CODE' ) {
 172         return if ( $opt_c == 3 );
 173         my $len = length($input_line);
 174         if ( $opt_c == 0 && $len > 0 ) {
 175             print_line( $roriginal_file, $input_line ) if $roriginal_file;
 176             print_line( $rmasked_file, '#' x $len );
 177         }
 178         else {
 179             print_line( $roriginal_file, $input_line ) if $roriginal_file;
 180             print_line( $rmasked_file, "" );
 181         }
 182         return;
 183     }
 184
 185     # we'll build the masked line token by token
 186     my $masked_line = "";
 187
 188     # add leading spaces if not in a higher compression mode
 189     if ( $opt_c <= 1 ) {
 190
 191         # Find leading whitespace.  But be careful..we don't want the
 192         # whitespace if it is part of quoted text, because it will
 193         # already be contained in a token.
 194         if ( $input_line =~ /^(\s+)/ && !$line_of_tokens->{_starting_in_quote} )
 195         {
 196             $masked_line = $1;
 197         }
 198     }
 199
 200     # loop over tokens to construct one masked line
 201     for ( my $j = 0 ; $j < @$rtoken_type ; $j++ ) {
 202
 203         # Mask certain token types by replacing them with their type code:
 204         # type  definition
 205         # ----  ----------
 206         # Q     quote or pattern
 207         # q     qw quote
 208         # h     << here doc operator
 209         # #     comment
 210         #
 211         # This choice will produce a mask file that has balanced
 212         # container tokens and does not cause parsing problems.
 213         if ( $$rtoken_type[$j] =~ /^[Qqh]$/ ) {
 214             if ( $opt_c <= 1 ) {
 215                 $masked_line .= $$rtoken_type[$j] x length( $$rtokens[$j] );
 216             }
 217             else {
 218                 $masked_line .= $$rtoken_type[$j];
 219             }
 220         }
 221
 222         # Mask a comment
 223         elsif ( $$rtoken_type[$j] eq '#' ) {
 224             if ( $opt_c == 0 ) {
 225                 $masked_line .= '#' x length( $$rtokens[$j] );
 226             }
 227         }
 228
 229         # All other tokens go out verbatim
 230         else {
 231             $masked_line .= $$rtokens[$j];
 232         }
 233     }
 234     print_line( $roriginal_file, $input_line ) if $roriginal_file;
 235     print_line( $rmasked_file, $masked_line );
 236
 237     # self-check lengths; this error should never happen
 238     if ( $opt_c == 0 && length($masked_line) != length($input_line) ) {
 239         my $lmask  = length($masked_line);
 240         my $linput = length($input_line);
 241         print STDERR
 242 "$input_line_number: length ERROR, masked length=$lmask but input length=$linput\n";
 243     }
 244 }
 245
 246 # called once after the last line of a file
 247 sub finish_formatting {
 248     my $self = shift;
 249     return;
 250 }