2 # -*- coding: utf-8 -*-
3 # extract_texi_filenames.py
5 # USAGE: extract_texi_filenames.py [-o OUTDIR] FILES
7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
10 # This script parses the .texi file given and creates a file with the
11 # nodename <=> filename/anchor map.
12 # The idea behind: Unnumbered subsections go into the same file as the
13 # previous numbered section, @translationof gives the original node name,
14 # which is then used for the filename/anchor.
16 # If this script is run on a file texifile.texi, it produces a file
17 # texifile_xref.map with tab-separated entries of the form
18 # NODE\tFILENAME\tANCHOR
19 # Note: The filename does not have any extension appended!
20 # This file can then be used by our texi2html init script to determine
21 # the correct file name and anchor for external refs
# Parse the command line.  The option string 'o:' declares a single option,
# -o, that takes an argument (the OUTDIR described in the usage header).
# optlist receives the (option, value) pairs, args the remaining
# non-option arguments.
30 optlist, args = getopt.getopt (sys.argv[1:],'o:')
# Matches one "@include NAME.texi" line; group 1 is NAME without the
# extension.  Includes whose path starts with the literal prefix "../lily-"
# are deliberately not matched.  The dots are escaped so that only a real
# "../" is excluded, not any two characters followed by "/lily-".
include_re = re.compile (r'@include ((?!\.\./lily-).*?)\.texi$', re.M)
# Matches any run of whitespace.
whitespaces = re.compile (r'\s+')
# Matches one sectioning line: @node, the numbered, unnumbered and appendix
# sectioning commands, the heading commands, and @translationof.
# Group 1 is the command name (without "@"), group 2 the rest of the line.
section_translation_re = re.compile (r'@(node|(?:unnumbered|appendix)(?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|(?:major|chap|(?:sub){0,2})heading|translationof) (.*?)\n')
# Replacement callback used with include_re.sub (see extract_sections).
# m is the match object for one "@include NAME.texi" line, with NAME
# (extension stripped) in group 1.  When the computed path exists, the
# result of extract_sections for that file is returned as the replacement
# text.
42 def expand_includes (m):
# NOTE(review): m.group(0) is the whole matched text ("@include NAME.texi"),
# not a file path, so os.path.dirname() of it is empty only when NAME has no
# directory part.  For "@include dir/x.texi" this appears to build
# "@include dir/dir/x.texi", which will not exist -- confirm; the directory
# of the including file was presumably intended.
43 filepath = os.path.join (os.path.dirname (m.group(0)), m.group(1)) + '.texi'
# Progress message on stdout (Python 2 print statement).
44 print "Including file: " + filepath
# Only files that exist are expanded (recursively, through extract_sections).
45 if os.path.exists (filepath):
46 return extract_sections (filepath)
# Collect the sectioning lines of the texinfo file `filename`, following its
# @include lines.  Callers in this file use the return value as texinfo text
# (as the replacement for an @include line, and as the `page` argument of
# process_sections).
49 def extract_sections (filename):
# NOTE(review): the statements that read from f and close it are not part of
# this excerpt -- confirm the handle is closed.
51 f = open (filename, 'r')
54 # Replace all includes by their list of sections and extract all sections
# expand_includes is invoked once per matching @include line; its return
# value replaces that line in page.
55 page = include_re.sub (expand_includes, page)
# findall yields one (command, argument) tuple per sectioning line.
56 sections = section_translation_re.findall (page)
# Re-emit the match as a normalized "@command argument" line.
58 result += "@" + sec[0] + " " + sec[1] + "\n"
# Convert a given node name to its proper file name (normalization as
# explained in the texinfo manual):
# http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
def texinfo_file_name(title):
    """Return the normalized file/anchor name for the node name *title*.

    The name "Top" maps to "index".  Otherwise leading and trailing
    whitespace is dropped, inner whitespace runs become a single '-',
    ASCII letters and digits are kept, and every other character is
    replaced by "_xxxx" (or "__xxxxxx" above 0xFFFF), where the x's are
    the character code in hex.  A result starting with a digit gets the
    prefix 't_g'.  An empty title yields an empty string.
    """
    # exception: The top node is always mapped to index.html
    if title == "Top":
        return "index"
    # File name normalization by texinfo (described in the texinfo manual):
    # 1/2: letters and numbers are left unchanged
    # 3/4: multiple, leading and trailing whitespace is removed
    title = re.sub (r'\s+', ' ', title.strip ())
    # 5: all remaining spaces are converted to '-'
    # 6: all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
    pieces = []
    for char in title:
        if char == ' ':  # space -> '-'
            pieces.append ('-')
        elif ('0' <= char <= '9') or ('A' <= char <= 'Z') or ('a' <= char <= 'z'):
            pieces.append (char)  # number or letter
        else:
            ccode = ord (char)
            if ccode <= 0xFFFF:
                pieces.append ("_%04x" % ccode)
            else:
                pieces.append ("__%06x" % ccode)
    result = ''.join (pieces)
    # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
    # The comparison is inclusive of '9' (a range() ending at ord('9') would
    # skip it), and the emptiness check avoids indexing an empty result.
    # NOTE(review): the Texinfo manual names the prefix 'g_t'; 't_g' is kept
    # because consumers of the generated map may rely on it -- confirm.
    if result and '0' <= result[0] <= '9':
        result = 't_g' + result
    return result
# Matches one brace command whose argument contains no further braces, e.g.
# "@code{foo}" or "@'{e}"; the argument is captured as group 1.  The command
# name and the argument are both bounded, so a match can never span from one
# command into the next.
texinfo_re = re.compile (r'@[^\s{}@]+\{([^{}]*)\}')
def remove_texinfo (title):
    """Return *title* with every @command{argument} replaced by its argument.

    Substitution is repeated until the text stops changing, so that nested
    commands such as "@code{@var{x}}" are unwrapped from the inside out and
    several commands in one title are each handled on their own.
    """
    previous = None
    while previous != title:
        previous = title
        title = texinfo_re.sub (r'\1', title)
    return title
def create_texinfo_anchor (title):
    """Return the anchor name for *title*.

    Texinfo commands are removed from the title first; the remaining text
    is then normalized with texinfo_file_name.
    """
    plain_title = remove_texinfo (title)
    return texinfo_file_name (plain_title)
# Matches section command names that begin with "unnumbered".
102 unnumbered_re = re.compile (r'unnumbered.*')
# Write the node map for `filename`.  `page` is texinfo text made of
# sectioning lines; every record is written to f as
# title<TAB>filename<TAB>anchor and echoed to stdout.
# NOTE(review): f and outdir are set on lines outside this excerpt, as are
# the loop over sections and the branch headers.
103 def process_sections (filename, page):
# One (command, argument) tuple per sectioning line in page.
104 sections = section_translation_re.findall (page)
105 # TODO: Don't rely on the file having a 4-letter extension (texi)!!!
# The slice drops the last five characters of the joined path (".texi",
# dot included) before '_xref.map' is appended.
106 p = os.path.join (outdir, filename) [:-5] + '_xref.map'
# Tracks whether the section currently being cached is an unnumbered one.
112 this_unnumbered = False
115 # Write out the cached values to the file and start a new section:
117 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
118 print (this_title + "\t" + this_filename + "\t" + this_anchor)
# The title is the section argument with texinfo commands removed; the
# anchor is that same argument normalized by create_texinfo_anchor.
119 this_title = remove_texinfo (sec[1])
120 this_anchor = create_texinfo_anchor (sec[1])
121 elif sec[0] == "translationof":
122 anchor = create_texinfo_anchor (sec[1])
123 # If @translationof is used, it gives the original node name, which
124 # we use for the anchor and the file name (if it is a numbered node)
126 if not this_unnumbered:
127 this_filename = anchor
129 # unnumbered nodes use the previously used file name, only numbered
130 # nodes get their own filename!
# match() returns a match object (truthy) when the command name starts with
# "unnumbered", and None otherwise.
131 this_unnumbered = unnumbered_re.match (sec[0])
132 if not this_unnumbered:
133 this_filename = this_anchor
# Same record format as above; presumably this flushes the last cached
# entry once all sections are processed -- the enclosing condition is not
# visible here.
136 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
137 print (this_title + "\t" + this_filename + "\t" + this_anchor)
# Driver: build the cross-reference map for each input file in turn.
# `files` is defined outside this excerpt (presumably the positional
# command-line arguments -- confirm).
for filename in files:
    # Parenthesized like the other print calls in this file: a single
    # parenthesized expression prints identically on Python 2 and also
    # parses on Python 3.
    print ("extract_texi_filenames.py: Processing %s" % filename)
    # First flatten the file and its includes into a list of sectioning
    # lines, then derive and write the node/filename/anchor map from it.
    sections = extract_sections (filename)
    process_sections (filename, sections)