examples/perlxmltok.pl

   1 #!/usr/bin/perl -w
   2 use strict;
   3 #
   4 # Convert a perl script into an xml file
   5 #
   6 # usage:
   7 # perlxmltok myfile.pl >myfile.xml
   8 # perlxmltok <myfile.pl >myfile.xml
   9 #
  10 # The script is broken at the line and token level.
  11 #
  12 # This file is one of the examples distributed with perltidy and demonstrates
  13 # using a callback object with Perl::Tidy to walk through a perl file and
  14 # process its tokens.  It may or may not have any actual usefulness.  You can
  15 # modify it to suit your own purposes; see sub get_line().
  16 #
  17 use Perl::Tidy;
  18 use IO::File;
  19 use Getopt::Std;
  20 use vars qw($opt_h);
  21 my $file;
  22 my $usage = <<EOM;
  23    usage: perlxmltok filename >outfile
  24 EOM
  25 getopts('h') or die "$usage";
  26 if ($opt_h) {die $usage}
  27 if ( @ARGV == 1 ) {
  28     $file = $ARGV[0];
  29 }
  30 else { die $usage }
  31 my $source;
  32 my $fh;
  33 if ($file) {
  34     $fh = IO::File->new( $file, 'r' );
  35     unless ($fh) { die "cannot open '$file': $!\n" }
  36     $source = $fh;
  37 }
  38 else {
  39     $source = '-';
  40 }
  41 my $formatter = Perl::Tidy::XmlWriter->new($file);
  42 my $dest;
  43
  44 # start perltidy, which will start calling our write_line()
  45 my $err = perltidy(
  46     'formatter'   => $formatter,    # callback object
  47     'source'      => $source,
  48     'destination' => \$dest,        # not really needed
  49     'argv'        => "-npro -se",   # dont need .perltidyrc
  50                                     # errors to STDOUT
  51 );
  52 if ($err) {
  53     die "Error calling perltidy\n";
  54 }
  55 $fh->close() if $fh;
  56
  57 #####################################################################
  58 #
  59 # The Perl::Tidy::XmlWriter class writes a copy of the input stream in xml
  60 #
  61 #####################################################################
  62
  63 package Perl::Tidy::XmlWriter;
  64
  65 # class variables
  66 use vars qw{
  67   %token_short_names
  68   %short_to_long_names
  69   $rOpts
  70   $missing_html_entities
  71 };
  72
  73 # replace unsafe characters with HTML entity representation if HTML::Entities
  74 # is available
  75 { eval "use HTML::Entities"; $missing_html_entities = $@; }
  76
  77 sub new {
  78
  79     my ( $class, $input_file ) = @_;
  80     my $self = bless { }, $class;
  81
  82     $self->print( <<"HEADER");
  83 <?xml version = "1.0"?>
  84 HEADER
  85
  86     unless ( !$input_file || $input_file eq '-' || ref($input_file) ) {
  87
  88         $self->print( <<"COMMENT");
  89 <!-- created by perltidy from file: $input_file -->
  90 COMMENT
  91     }
  92
  93     $self->print("<file>\n");
  94     return $self;
  95 }
  96
  97 sub print {
  98     my ( $self, $line ) = @_;
  99     print $line;
 100 }
 101
 102 sub write_line {
 103
 104     # This routine will be called once perl line by perltidy
 105     my $self = shift;
 106     my ($line_of_tokens) = @_;
 107     my $line_type        = $line_of_tokens->{_line_type};
 108     my $input_line       = $line_of_tokens->{_line_text};
 109     my $line_number      = $line_of_tokens->{_line_number};
 110     chomp $input_line;
 111     $self->print(" <line type='$line_type'>\n");
 112     $self->print("  <text>\n");
 113
 114     $input_line = my_encode_entities($input_line);
 115     $self->print("$input_line\n");
 116     $self->print("  </text>\n");
 117
 118     # markup line of code..
 119     if ( $line_type eq 'CODE' ) {
 120         my $xml_line;
 121         my $rtoken_type = $line_of_tokens->{_rtoken_type};
 122         my $rtokens     = $line_of_tokens->{_rtokens};
 123
 124         if ( $input_line =~ /(^\s*)/ ) {
 125             $xml_line = $1;
 126         }
 127         else {
 128             $xml_line = "";
 129         }
 130         my $rmarked_tokens = $self->markup_tokens( $rtokens, $rtoken_type );
 131         $xml_line .= join '', @$rmarked_tokens;
 132
 133         $self->print("  <tokens>\n");
 134         $self->print("$xml_line\n");
 135         $self->print("  </tokens>\n");
 136     }
 137
 138     $self->print(" </line>\n");
 139 }
 140
 141 BEGIN {
 142
 143     # This is the official list of tokens which may be identified by the
 144     # user.  Long names are used as getopt keys.  Short names are
 145     # convenient short abbreviations for specifying input.  Short names
 146     # somewhat resemble token type characters, but are often different
 147     # because they may only be alphanumeric, to allow command line
 148     # input.  Also, note that because of case insensitivity of xml,
 149     # this table must be in a single case only (I've chosen to use all
 150     # lower case).
 151     # When adding NEW_TOKENS: update this hash table
 152     # short names => long names
 153     %short_to_long_names = (
 154         'n'  => 'numeric',
 155         'p'  => 'paren',
 156         'q'  => 'quote',
 157         's'  => 'structure',
 158         'c'  => 'comment',
 159         'b'  => 'blank',
 160         'v'  => 'v-string',
 161         'cm' => 'comma',
 162         'w'  => 'bareword',
 163         'co' => 'colon',
 164         'pu' => 'punctuation',
 165         'i'  => 'identifier',
 166         'j'  => 'label',
 167         'h'  => 'here-doc-target',
 168         'hh' => 'here-doc-text',
 169         'k'  => 'keyword',
 170         'sc' => 'semicolon',
 171         'm'  => 'subroutine',
 172         'pd' => 'pod-text',
 173     );
 174
 175     # Now we have to map actual token types into one of the above short
 176     # names; any token types not mapped will get 'punctuation'
 177     # properties.
 178
 179     # The values of this hash table correspond to the keys of the
 180     # previous hash table.
 181     # The keys of this hash table are token types and can be seen
 182     # by running with --dump-token-types (-dtt).
 183
 184     # When adding NEW_TOKENS: update this hash table
 185     # $type => $short_name
 186     %token_short_names = (
 187         '#'  => 'c',
 188         'n'  => 'n',
 189         'v'  => 'v',
 190         'b'  => 'b',
 191         'k'  => 'k',
 192         'F'  => 'k',
 193         'Q'  => 'q',
 194         'q'  => 'q',
 195         'J'  => 'j',
 196         'j'  => 'j',
 197         'h'  => 'h',
 198         'H'  => 'hh',
 199         'w'  => 'w',
 200         ','  => 'cm',
 201         '=>' => 'cm',
 202         ';'  => 'sc',
 203         ':'  => 'co',
 204         'f'  => 'sc',
 205         '('  => 'p',
 206         ')'  => 'p',
 207         'M'  => 'm',
 208         'P'  => 'pd',
 209     );
 210
 211     # These token types will all be called identifiers for now
 212     # FIXME: need to separate user defined modules as separate type
 213     my @identifier = qw" i t U C Y Z G :: ";
 214     @token_short_names{@identifier} = ('i') x scalar(@identifier);
 215
 216     # These token types will be called 'structure'
 217     my @structure = qw" { } ";
 218     @token_short_names{@structure} = ('s') x scalar(@structure);
 219
 220 }
 221
 222 sub markup_tokens {
 223     my $self = shift;
 224     my ( $rtokens, $rtoken_type ) = @_;
 225     my ( @marked_tokens, $j, $string, $type, $token );
 226
 227     for ( $j = 0 ; $j < @$rtoken_type ; $j++ ) {
 228         $type  = $$rtoken_type[$j];
 229         $token = $$rtokens[$j];
 230
 231         #-------------------------------------------------------
 232         # Patch : intercept a sub name here and split it
 233         # into keyword 'sub' and sub name
 234         if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) {
 235             $token = $self->markup_xml_element( $1, 'k' );
 236             push @marked_tokens, $token;
 237             $token = $2;
 238             $type  = 'M';
 239         }
 240
 241         # Patch : intercept a package name here and split it
 242         # into keyword 'package' and name
 243         if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) {
 244             $token = $self->markup_xml_element( $1, 'k' );
 245             push @marked_tokens, $token;
 246             $token = $2;
 247             $type  = 'i';
 248         }
 249         #-------------------------------------------------------
 250
 251         $token = $self->markup_xml_element( $token, $type );
 252         push @marked_tokens, $token;
 253     }
 254     return \@marked_tokens;
 255 }
 256
 257 sub my_encode_entities {
 258     my ($token) = @_;
 259
 260     # escape any characters not allowed in XML content.
 261     # ??s/\92/&apos;/;
 262     if ($missing_html_entities) {
 263         $token =~ s/\&/&amp;/g;
 264         $token =~ s/\</&lt;/g;
 265         $token =~ s/\>/&gt;/g;
 266         $token =~ s/\"/&quot;/g;
 267     }
 268     else {
 269         HTML::Entities::encode_entities($token);
 270     }
 271     return $token;
 272 }
 273
 274 sub markup_xml_element {
 275     my $self = shift;
 276     my ( $token, $type ) = @_;
 277     if ($token) { $token = my_encode_entities($token) }
 278
 279     # get the short abbreviation for this token type
 280     my $short_name = $token_short_names{$type};
 281     if ( !defined($short_name) ) {
 282         $short_name = "pu";    # punctuation is default
 283     }
 284     $token = qq(<$short_name>) . $token . qq(</$short_name>);
 285     return $token;
 286 }
 287
 288 sub finish_formatting {
 289
 290     # called after last line
 291     my $self = shift;
 292     $self->print("</file>\n");
 293     return;
 294 }