4 # Walk through a perl script and create a masked file which is
5 # similar but which masks comments, quotes, patterns, and non-code
6 # lines so that it is easy to parse with regular expressions.
9 # perlmask [-cn] myfile.pl >myfile.new
10 # perlmask [-cn] <myfile.pl >myfile.new
13 # -comments and pod will be masked (or removed)
14 # -here-doc text lines will be masked (or removed)
15 # -quotes and patterns, qw quotes, and here doc << operators will be
16 # replaced by the letters 'Q', 'q', or 'h'
18 # The result is a file in which all braces, parens, and square brackets
19 # are balanced, and it can be parsed relatively easily by regular
22 # -cn is an optional 'compression' flag. By default the masked file will have
23 # the same number of characters as the input file, with the difference being
24 # that certain characters will be changed (masked).
26 # If character position correspondence is not required, the size of the masked
27 # file can be significantly reduced by increasing the 'compression' level as
30 # -c0 all mask file line numbers and character positions agree with
31 # original file (DEFAULT)
32 # -c1 line numbers agree and character positions agree within lines of code
33 # -c2 line numbers agree but character positions do not
34 # -c3 no correspondence between line numbers or character positions
36 # Try each of these on a file of significant size to see how they work.
37 # The default, -c0, is required if you are working with character positions
38 # that span multiple lines. The other levels may be useful if you
39 # do not need this level of correspondence.
41 # This file is one of the examples distributed with perltidy and demonstrates
42 # using a callback object with Perl::Tidy to walk through a perl file and find
43 # all of its tokens. It can be useful for simple perl code parsing tasks. It
44 # might even be helpful in debugging. Or you may want to modify it to suit
50 use vars qw($opt_c $opt_h);
52 usage: perlmask [ -cn ] filename >outfile
54 getopts('c:h') or die "$usage";
55 if ($opt_h) { die $usage }
56 unless ( defined($opt_c) ) { $opt_c = 0 }
57 if (@ARGV > 1) { die $usage }
59 my $source=$ARGV[0]; # an undefined filename will become stdin
61 # strings to hold the files (arrays could be used to)
62 my ( $masked_file, $original_file );
66 _rmasked_file => \$masked_file,
67 _roriginal_file => \$original_file, # optional
68 _compression => $opt_c # optional, default=0
71 # Now we have the masked and original files in strings of equal length.
72 # We could search for specific text in the masked file here. But here
73 # we'll just print the masked file:
74 if ($masked_file) { print $masked_file; }
76 #####################################################################
78 # The PerlMask package is an interface to perltidy which accepts a
79 # source filehandle and returns a 'masked' version of the source as
80 # a string or array. It can also optionally return the original file
81 # as a string or array.
83 # It works by making a a callback object with a write_line() method to
84 # receive tokenized lines from perltidy. This write_line method
85 # selectively replaces tokens with either their original text or with a
86 # benign masking character (such as '#' or 'Q').
91 # _source => $fh, # required source
92 # _rmasked_file => \$masked_file, # required ref to ARRAY or SCALAR
93 # _roriginal_file => \$original_file, # optional ref to ARRAY or SCALAR
94 # _compression => $opt_c # optional
97 # _source is any source that perltidy will accept, including a
98 # filehandle or reference to SCALAR or ARRAY
100 # The compression flag may have these values:
101 # 0 all mask file line numbers and character positions agree with
102 # original file (DEFAULT)
103 # 1 line numbers agree and character positions agree within lines of code
104 # 2 line numbers agree but character positions do not
105 # 3 no correspondence between line numbers or character positions
107 #####################################################################
115 my %args = ( _compression => 0, @_ );
116 my $rfile = $args{_rmasked_file};
117 unless ( defined($rfile) ) {
119 "Missing required parameter '_rmasked_file' in call to perlmask\n";
122 unless ( $ref =~ /^(SCALAR|ARRAY)$/ ) {
124 Expecting _rmasked_file = ref to SCALAR or ARRAY in perlmask but got : ($ref)
128 # run perltidy, which will call $formatter's write_line() for each line
130 'source' => $args{_source},
131 'formatter' => bless( \%args, __PACKAGE__ ), # callback object
132 'argv' => "-npro -se", # -npro : ignore .perltidyrc,
133 # -se : errors to STDOUT
139 # called from write_line to dispatch one line (either masked or original)..
140 # here we'll either append it to a string or array, as appropriate
141 my ( $rfile, $line ) = @_;
142 if ( defined($rfile) ) {
143 if ( ref($rfile) eq 'SCALAR' ) {
144 $$rfile .= $line . "\n";
146 elsif ( ref($rfile) eq 'ARRAY' ) {
147 push @{$rfile}, $line . "\n";
154 # This is called from perltidy line-by-line
155 my ( $self, $line_of_tokens ) = @_;
156 my $rmasked_file = $self->{_rmasked_file};
157 my $roriginal_file = $self->{_roriginal_file};
158 my $opt_c = $self->{_compression};
160 my $line_type = $line_of_tokens->{_line_type};
161 my $input_line_number = $line_of_tokens->{_line_number};
162 my $input_line = $line_of_tokens->{_line_text};
163 my $rtoken_type = $line_of_tokens->{_rtoken_type};
164 my $rtokens = $line_of_tokens->{_rtokens};
167 # mask non-CODE lines
168 if ( $line_type ne 'CODE' ) {
169 return if ( $opt_c == 3 );
170 my $len = length($input_line);
171 if ( $opt_c == 0 && $len > 0 ) {
172 print_line( $roriginal_file, $input_line ) if $roriginal_file;
173 print_line( $rmasked_file, '#' x $len );
176 print_line( $roriginal_file, $input_line ) if $roriginal_file;
177 print_line( $rmasked_file, "" );
182 # we'll build the masked line token by token
183 my $masked_line = "";
185 # add leading spaces if not in a higher compression mode
188 # Find leading whitespace. But be careful..we don't want the
189 # whitespace if it is part of quoted text, because it will
190 # already be contained in a token.
191 if ( $input_line =~ /^(\s+)/ && !$line_of_tokens->{_starting_in_quote} )
197 # loop over tokens to construct one masked line
198 for ( my $j = 0 ; $j < @$rtoken_type ; $j++ ) {
200 # Mask certain token types by replacing them with their type code:
205 # h << here doc operator
208 # This choice will produce a mask file that has balanced
209 # container tokens and does not cause parsing problems.
210 if ( $$rtoken_type[$j] =~ /^[Qqh]$/ ) {
212 $masked_line .= $$rtoken_type[$j] x length( $$rtokens[$j] );
215 $masked_line .= $$rtoken_type[$j];
220 elsif ( $$rtoken_type[$j] eq '#' ) {
222 $masked_line .= '#' x length( $$rtokens[$j] );
226 # All other tokens go out verbatim
228 $masked_line .= $$rtokens[$j];
231 print_line( $roriginal_file, $input_line ) if $roriginal_file;
232 print_line( $rmasked_file, $masked_line );
234 # self-check lengths; this error should never happen
235 if ( $opt_c == 0 && length($masked_line) != length($input_line) ) {
236 my $lmask = length($masked_line);
237 my $linput = length($input_line);
239 "$input_line_number: length ERROR, masked length=$lmask but input length=$linput\n";
243 # called once after the last line of a file
244 sub finish_formatting {