4 # Convert a perl script into an xml file
7 # perlxmltok myfile.pl >myfile.xml
8 # perlxmltok <myfile.pl >myfile.xml
10 # The script is broken at the line and token level.
12 # This file is one of the examples distributed with perltidy and demonstrates
13 # using a callback object with Perl::Tidy to walk through a perl file and
14 # process its tokens. It may or may not have any actual usefulness. You can
15 # modify it to suit your own purposes; see sub get_line().
23 usage: perlxmltok filename >outfile
25 getopts('h') or die "$usage";
26 if ($opt_h) {die $usage}
34 $fh = IO::File->new( $file, 'r' );
35 unless ($fh) { die "cannot open '$file': $!\n" }
41 my $formatter = Perl::Tidy::XmlWriter->new($file);
44 # start perltidy, which will start calling our write_line()
46 'formatter' => $formatter, # callback object
48 'destination' => \$dest, # not really needed
49 'argv' => "-npro -se", # dont need .perltidyrc
54 #####################################################################
56 # The Perl::Tidy::XmlWriter class writes a copy of the input stream in xml
58 #####################################################################
60 package Perl::Tidy::XmlWriter;
67 $missing_html_entities
70 # replace unsafe characters with HTML entity representation if HTML::Entities
72 { eval "use HTML::Entities"; $missing_html_entities = $@; }
76 my ( $class, $input_file ) = @_;
77 my $self = bless { }, $class;
79 $self->print( <<"HEADER");
80 <?xml version = "1.0"?>
83 unless ( !$input_file || $input_file eq '-' || ref($input_file) ) {
85 $self->print( <<"COMMENT");
86 <!-- created by perltidy from file: $input_file -->
90 $self->print("<file>\n");
95 my ( $self, $line ) = @_;
101 # This routine will be called once perl line by perltidy
103 my ($line_of_tokens) = @_;
104 my $line_type = $line_of_tokens->{_line_type};
105 my $input_line = $line_of_tokens->{_line_text};
106 my $line_number = $line_of_tokens->{_line_number};
108 $self->print(" <line type='$line_type'>\n");
109 $self->print(" <text>\n");
111 $input_line = my_encode_entities($input_line);
112 $self->print("$input_line\n");
113 $self->print(" </text>\n");
115 # markup line of code..
116 if ( $line_type eq 'CODE' ) {
118 my $rtoken_type = $line_of_tokens->{_rtoken_type};
119 my $rtokens = $line_of_tokens->{_rtokens};
121 if ( $input_line =~ /(^\s*)/ ) {
127 my $rmarked_tokens = $self->markup_tokens( $rtokens, $rtoken_type );
128 $xml_line .= join '', @$rmarked_tokens;
130 $self->print(" <tokens>\n");
131 $self->print("$xml_line\n");
132 $self->print(" </tokens>\n");
135 $self->print(" </line>\n");
140 # This is the official list of tokens which may be identified by the
141 # user. Long names are used as getopt keys. Short names are
142 # convenient short abbreviations for specifying input. Short names
143 # somewhat resemble token type characters, but are often different
144 # because they may only be alphanumeric, to allow command line
145 # input. Also, note that because of case insensitivity of xml,
146 # this table must be in a single case only (I've chosen to use all
148 # When adding NEW_TOKENS: update this hash table
149 # short names => long names
150 %short_to_long_names = (
161 'pu' => 'punctuation',
164 'h' => 'here-doc-target',
165 'hh' => 'here-doc-text',
172 # Now we have to map actual token types into one of the above short
173 # names; any token types not mapped will get 'punctuation'
176 # The values of this hash table correspond to the keys of the
177 # previous hash table.
178 # The keys of this hash table are token types and can be seen
179 # by running with --dump-token-types (-dtt).
181 # When adding NEW_TOKENS: update this hash table
182 # $type => $short_name
183 %token_short_names = (
208 # These token types will all be called identifiers for now
209 # FIXME: need to separate user defined modules as separate type
210 my @identifier = qw" i t U C Y Z G :: ";
211 @token_short_names{@identifier} = ('i') x scalar(@identifier);
213 # These token types will be called 'structure'
214 my @structure = qw" { } ";
215 @token_short_names{@structure} = ('s') x scalar(@structure);
221 my ( $rtokens, $rtoken_type ) = @_;
222 my ( @marked_tokens, $j, $string, $type, $token );
224 for ( $j = 0 ; $j < @$rtoken_type ; $j++ ) {
225 $type = $$rtoken_type[$j];
226 $token = $$rtokens[$j];
228 #-------------------------------------------------------
229 # Patch : intercept a sub name here and split it
230 # into keyword 'sub' and sub name
231 if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) {
232 $token = $self->markup_xml_element( $1, 'k' );
233 push @marked_tokens, $token;
238 # Patch : intercept a package name here and split it
239 # into keyword 'package' and name
240 if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) {
241 $token = $self->markup_xml_element( $1, 'k' );
242 push @marked_tokens, $token;
246 #-------------------------------------------------------
248 $token = $self->markup_xml_element( $token, $type );
249 push @marked_tokens, $token;
251 return \@marked_tokens;
254 sub my_encode_entities {
257 # escape any characters not allowed in XML content.
259 if ($missing_html_entities) {
260 $token =~ s/\&/&/g;
261 $token =~ s/\</</g;
262 $token =~ s/\>/>/g;
263 $token =~ s/\"/"/g;
266 HTML::Entities::encode_entities($token);
271 sub markup_xml_element {
273 my ( $token, $type ) = @_;
274 if ($token) { $token = my_encode_entities($token) }
276 # get the short abbreviation for this token type
277 my $short_name = $token_short_names{$type};
278 if ( !defined($short_name) ) {
279 $short_name = "pu"; # punctuation is default
281 $token = qq(<$short_name>) . $token . qq(</$short_name>);
285 sub finish_formatting {
287 # called after last line
289 $self->print("</file>\n");