4 # Convert a perl script into an xml file
7 # perlxmltok myfile.pl >myfile.xml
8 # perlxmltok <myfile.pl >myfile.xml
10 # The script is broken at the line and token level.
12 # This file is one of the examples distributed with perltidy and demonstrates
13 # using a callback object with Perl::Tidy to walk through a perl file and
14 # process its tokens. It may or may not have any actual usefulness. You can
15 # modify it to suit your own purposes; see sub get_line().
23 usage: perlxmltok filename >outfile
25 getopts('h') or die "$usage";
26 if ($opt_h) {die $usage}
34 $fh = IO::File->new( $file, 'r' );
35 unless ($fh) { die "cannot open '$file': $!\n" }
41 my $formatter = Perl::Tidy::XmlWriter->new($file);
44 # start perltidy, which will start calling our write_line()
46 'formatter' => $formatter, # callback object
48 'destination' => \$dest, # not really needed
49 'argv' => "-npro -se", # dont need .perltidyrc
53 die "Error calling perltidy\n";
57 #####################################################################
59 # The Perl::Tidy::XmlWriter class writes a copy of the input stream in xml
61 #####################################################################
63 package Perl::Tidy::XmlWriter;
70 $missing_html_entities
73 # replace unsafe characters with HTML entity representation if HTML::Entities
75 { eval "use HTML::Entities"; $missing_html_entities = $@; }
79 my ( $class, $input_file ) = @_;
80 my $self = bless { }, $class;
82 $self->print( <<"HEADER");
83 <?xml version = "1.0"?>
86 unless ( !$input_file || $input_file eq '-' || ref($input_file) ) {
88 $self->print( <<"COMMENT");
89 <!-- created by perltidy from file: $input_file -->
93 $self->print("<file>\n");
98 my ( $self, $line ) = @_;
104 # This routine will be called once perl line by perltidy
106 my ($line_of_tokens) = @_;
107 my $line_type = $line_of_tokens->{_line_type};
108 my $input_line = $line_of_tokens->{_line_text};
109 my $line_number = $line_of_tokens->{_line_number};
111 $self->print(" <line type='$line_type'>\n");
112 $self->print(" <text>\n");
114 $input_line = my_encode_entities($input_line);
115 $self->print("$input_line\n");
116 $self->print(" </text>\n");
118 # markup line of code..
119 if ( $line_type eq 'CODE' ) {
121 my $rtoken_type = $line_of_tokens->{_rtoken_type};
122 my $rtokens = $line_of_tokens->{_rtokens};
124 if ( $input_line =~ /(^\s*)/ ) {
130 my $rmarked_tokens = $self->markup_tokens( $rtokens, $rtoken_type );
131 $xml_line .= join '', @$rmarked_tokens;
133 $self->print(" <tokens>\n");
134 $self->print("$xml_line\n");
135 $self->print(" </tokens>\n");
138 $self->print(" </line>\n");
143 # This is the official list of tokens which may be identified by the
144 # user. Long names are used as getopt keys. Short names are
145 # convenient short abbreviations for specifying input. Short names
146 # somewhat resemble token type characters, but are often different
147 # because they may only be alphanumeric, to allow command line
148 # input. Also, note that because of case insensitivity of xml,
149 # this table must be in a single case only (I've chosen to use all
151 # When adding NEW_TOKENS: update this hash table
152 # short names => long names
153 %short_to_long_names = (
164 'pu' => 'punctuation',
167 'h' => 'here-doc-target',
168 'hh' => 'here-doc-text',
175 # Now we have to map actual token types into one of the above short
176 # names; any token types not mapped will get 'punctuation'
179 # The values of this hash table correspond to the keys of the
180 # previous hash table.
181 # The keys of this hash table are token types and can be seen
182 # by running with --dump-token-types (-dtt).
184 # When adding NEW_TOKENS: update this hash table
185 # $type => $short_name
186 %token_short_names = (
211 # These token types will all be called identifiers for now
212 # FIXME: need to separate user defined modules as separate type
213 my @identifier = qw" i t U C Y Z G :: ";
214 @token_short_names{@identifier} = ('i') x scalar(@identifier);
216 # These token types will be called 'structure'
217 my @structure = qw" { } ";
218 @token_short_names{@structure} = ('s') x scalar(@structure);
224 my ( $rtokens, $rtoken_type ) = @_;
225 my ( @marked_tokens, $j, $string, $type, $token );
227 for ( $j = 0 ; $j < @$rtoken_type ; $j++ ) {
228 $type = $$rtoken_type[$j];
229 $token = $$rtokens[$j];
231 #-------------------------------------------------------
232 # Patch : intercept a sub name here and split it
233 # into keyword 'sub' and sub name
234 if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) {
235 $token = $self->markup_xml_element( $1, 'k' );
236 push @marked_tokens, $token;
241 # Patch : intercept a package name here and split it
242 # into keyword 'package' and name
243 if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) {
244 $token = $self->markup_xml_element( $1, 'k' );
245 push @marked_tokens, $token;
249 #-------------------------------------------------------
251 $token = $self->markup_xml_element( $token, $type );
252 push @marked_tokens, $token;
254 return \@marked_tokens;
257 sub my_encode_entities {
260 # escape any characters not allowed in XML content.
262 if ($missing_html_entities) {
263 $token =~ s/\&/&/g;
264 $token =~ s/\</</g;
265 $token =~ s/\>/>/g;
266 $token =~ s/\"/"/g;
269 HTML::Entities::encode_entities($token);
274 sub markup_xml_element {
276 my ( $token, $type ) = @_;
277 if ($token) { $token = my_encode_entities($token) }
279 # get the short abbreviation for this token type
280 my $short_name = $token_short_names{$type};
281 if ( !defined($short_name) ) {
282 $short_name = "pu"; # punctuation is default
284 $token = qq(<$short_name>) . $token . qq(</$short_name>);
288 sub finish_formatting {
290 # called after last line
292 $self->print("</file>\n");