perlxmltok.pl

   1 #!/usr/bin/perl -w
   2 use strict;
   3 #
   4 # Convert a perl script into an xml file
   5 #
   6 # usage:
   7 # perlxmltok myfile.pl >myfile.xml
   8 # perlxmltok <myfile.pl >myfile.xml
   9 #
  10 # The script is broken at the line and token level.
  11 #
  12 # This file is one of the examples distributed with perltidy and demonstrates
  13 # using a callback object with Perl::Tidy to walk through a perl file and
  14 # process its tokens.  It may or may not have any actual usefulness.  You can
  15 # modify it to suit your own purposes; see sub get_line().
  16 #
  17 use Perl::Tidy;
  18 use IO::File;
  19 use Getopt::Std;
  20 use vars qw($opt_h);
  21 my $file;
  22 my $usage = <<EOM;
  23    usage: perlxmltok filename >outfile
  24 EOM
  25 getopts('h') or die "$usage";
  26 if ($opt_h) {die $usage}
  27 if ( @ARGV == 1 ) {
  28     $file = $ARGV[0];
  29 }
  30 else { die $usage }
  31 my $source;
  32 my $fh;
  33 if ($file) {
  34     $fh = IO::File->new( $file, 'r' );
  35     unless ($fh) { die "cannot open '$file': $!\n" }
  36     $source = $fh;
  37 }
  38 else {
  39     $source = '-';
  40 }
  41 my $formatter = Perl::Tidy::XmlWriter->new($file);
  42 my $dest;
  43
  44 # start perltidy, which will start calling our write_line()
  45 perltidy(
  46     'formatter'   => $formatter,    # callback object
  47     'source'      => $source,
  48     'destination' => \$dest,        # not really needed
  49     'argv'        => "-npro -se",   # dont need .perltidyrc
  50                                     # errors to STDOUT
  51 );
  52 $fh->close() if $fh;
  53
  54 #####################################################################
  55 #
  56 # The Perl::Tidy::XmlWriter class writes a copy of the input stream in xml
  57 #
  58 #####################################################################
  59
  60 package Perl::Tidy::XmlWriter;
  61
  62 # class variables
  63 use vars qw{
  64   %token_short_names
  65   %short_to_long_names
  66   $rOpts
  67   $missing_html_entities
  68 };
  69
  70 # replace unsafe characters with HTML entity representation if HTML::Entities
  71 # is available
  72 { eval "use HTML::Entities"; $missing_html_entities = $@; }
  73
  74 sub new {
  75
  76     my ( $class, $input_file ) = @_;
  77     my $self = bless { }, $class;
  78
  79     $self->print( <<"HEADER");
  80 <?xml version = "1.0"?>
  81 HEADER
  82
  83     unless ( !$input_file || $input_file eq '-' || ref($input_file) ) {
  84
  85         $self->print( <<"COMMENT");
  86 <!-- created by perltidy from file: $input_file -->
  87 COMMENT
  88     }
  89
  90     $self->print("<file>\n");
  91     return $self;
  92 }
  93
  94 sub print {
  95     my ( $self, $line ) = @_;
  96     print $line;
  97 }
  98
  99 sub write_line {
 100
 101     # This routine will be called once perl line by perltidy
 102     my $self = shift;
 103     my ($line_of_tokens) = @_;
 104     my $line_type        = $line_of_tokens->{_line_type};
 105     my $input_line       = $line_of_tokens->{_line_text};
 106     my $line_number      = $line_of_tokens->{_line_number};
 107     chomp $input_line;
 108     $self->print(" <line type='$line_type'>\n");
 109     $self->print("  <text>\n");
 110
 111     $input_line = my_encode_entities($input_line);
 112     $self->print("$input_line\n");
 113     $self->print("  </text>\n");
 114
 115     # markup line of code..
 116     if ( $line_type eq 'CODE' ) {
 117         my $xml_line;
 118         my $rtoken_type = $line_of_tokens->{_rtoken_type};
 119         my $rtokens     = $line_of_tokens->{_rtokens};
 120
 121         if ( $input_line =~ /(^\s*)/ ) {
 122             $xml_line = $1;
 123         }
 124         else {
 125             $xml_line = "";
 126         }
 127         my $rmarked_tokens = $self->markup_tokens( $rtokens, $rtoken_type );
 128         $xml_line .= join '', @$rmarked_tokens;
 129
 130         $self->print("  <tokens>\n");
 131         $self->print("$xml_line\n");
 132         $self->print("  </tokens>\n");
 133     }
 134
 135     $self->print(" </line>\n");
 136 }
 137
 138 BEGIN {
 139
 140     # This is the official list of tokens which may be identified by the
 141     # user.  Long names are used as getopt keys.  Short names are
 142     # convenient short abbreviations for specifying input.  Short names
 143     # somewhat resemble token type characters, but are often different
 144     # because they may only be alphanumeric, to allow command line
 145     # input.  Also, note that because of case insensitivity of xml,
 146     # this table must be in a single case only (I've chosen to use all
 147     # lower case).
 148     # When adding NEW_TOKENS: update this hash table
 149     # short names => long names
 150     %short_to_long_names = (
 151         'n'  => 'numeric',
 152         'p'  => 'paren',
 153         'q'  => 'quote',
 154         's'  => 'structure',
 155         'c'  => 'comment',
 156         'b'  => 'blank',
 157         'v'  => 'v-string',
 158         'cm' => 'comma',
 159         'w'  => 'bareword',
 160         'co' => 'colon',
 161         'pu' => 'punctuation',
 162         'i'  => 'identifier',
 163         'j'  => 'label',
 164         'h'  => 'here-doc-target',
 165         'hh' => 'here-doc-text',
 166         'k'  => 'keyword',
 167         'sc' => 'semicolon',
 168         'm'  => 'subroutine',
 169         'pd' => 'pod-text',
 170     );
 171
 172     # Now we have to map actual token types into one of the above short
 173     # names; any token types not mapped will get 'punctuation'
 174     # properties.
 175
 176     # The values of this hash table correspond to the keys of the
 177     # previous hash table.
 178     # The keys of this hash table are token types and can be seen
 179     # by running with --dump-token-types (-dtt).
 180
 181     # When adding NEW_TOKENS: update this hash table
 182     # $type => $short_name
 183     %token_short_names = (
 184         '#'  => 'c',
 185         'n'  => 'n',
 186         'v'  => 'v',
 187         'b'  => 'b',
 188         'k'  => 'k',
 189         'F'  => 'k',
 190         'Q'  => 'q',
 191         'q'  => 'q',
 192         'J'  => 'j',
 193         'j'  => 'j',
 194         'h'  => 'h',
 195         'H'  => 'hh',
 196         'w'  => 'w',
 197         ','  => 'cm',
 198         '=>' => 'cm',
 199         ';'  => 'sc',
 200         ':'  => 'co',
 201         'f'  => 'sc',
 202         '('  => 'p',
 203         ')'  => 'p',
 204         'M'  => 'm',
 205         'P'  => 'pd',
 206     );
 207
 208     # These token types will all be called identifiers for now
 209     # FIXME: need to separate user defined modules as separate type
 210     my @identifier = qw" i t U C Y Z G :: ";
 211     @token_short_names{@identifier} = ('i') x scalar(@identifier);
 212
 213     # These token types will be called 'structure'
 214     my @structure = qw" { } ";
 215     @token_short_names{@structure} = ('s') x scalar(@structure);
 216
 217 }
 218
 219 sub markup_tokens {
 220     my $self = shift;
 221     my ( $rtokens, $rtoken_type ) = @_;
 222     my ( @marked_tokens, $j, $string, $type, $token );
 223
 224     for ( $j = 0 ; $j < @$rtoken_type ; $j++ ) {
 225         $type  = $$rtoken_type[$j];
 226         $token = $$rtokens[$j];
 227
 228         #-------------------------------------------------------
 229         # Patch : intercept a sub name here and split it
 230         # into keyword 'sub' and sub name
 231         if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) {
 232             $token = $self->markup_xml_element( $1, 'k' );
 233             push @marked_tokens, $token;
 234             $token = $2;
 235             $type  = 'M';
 236         }
 237
 238         # Patch : intercept a package name here and split it
 239         # into keyword 'package' and name
 240         if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) {
 241             $token = $self->markup_xml_element( $1, 'k' );
 242             push @marked_tokens, $token;
 243             $token = $2;
 244             $type  = 'i';
 245         }
 246         #-------------------------------------------------------
 247
 248         $token = $self->markup_xml_element( $token, $type );
 249         push @marked_tokens, $token;
 250     }
 251     return \@marked_tokens;
 252 }
 253
 254 sub my_encode_entities {
 255     my ($token) = @_;
 256
 257     # escape any characters not allowed in XML content.
 258     # ??s/\92/&apos;/;
 259     if ($missing_html_entities) {
 260         $token =~ s/\&/&amp;/g;
 261         $token =~ s/\</&lt;/g;
 262         $token =~ s/\>/&gt;/g;
 263         $token =~ s/\"/&quot;/g;
 264     }
 265     else {
 266         HTML::Entities::encode_entities($token);
 267     }
 268     return $token;
 269 }
 270
 271 sub markup_xml_element {
 272     my $self = shift;
 273     my ( $token, $type ) = @_;
 274     if ($token) { $token = my_encode_entities($token) }
 275
 276     # get the short abbreviation for this token type
 277     my $short_name = $token_short_names{$type};
 278     if ( !defined($short_name) ) {
 279         $short_name = "pu";    # punctuation is default
 280     }
 281     $token = qq(<$short_name>) . $token . qq(</$short_name>);
 282     return $token;
 283 }
 284
 285 sub finish_formatting {
 286
 287     # called after last line
 288     my $self = shift;
 289     $self->print("</file>\n");
 290     return;
 291 }