2 # -*- coding: utf-8 -*-
3 # extract_texi_filenames.py
5 # USAGE: extract_texi_filenames.py [-o OUTDIR] FILES
7 # -o OUTDIR specifies that output files should rather be written in OUTDIR
10 # This script parses the .texi file given and creates a file with the
11 # nodename <=> filename/anchor map.
12 # The idea behind: Unnumbered subsections go into the same file as the
13 # previous numbered section, @translationof gives the original node name,
14 # which is then used for the filename/anchor.
16 # If this script is run on a file texifile.texi, it produces a file
17 # texifile[.LANG].xref-map with tab-separated entries of the form
18 # NODE\tFILENAME\tANCHOR
19 # LANG is the document language in case it's not 'en'
20 # Note: The filename does not have any extension appended!
21 # This file should then be used by our texi2html init script to determine
22 # the correct file name and anchor for external refs
24 # For translated documentation: cross-references to nodes that exist
25 # only in documentation in English are allowed, that's why the already
26 # generated map file of docs in English is loaded with
27 # --master-map-file option, then the node names that are defined in
28 # the map for the manual in English but not in the translated manual
29 # are added to the map for the translated manual.
# Parse the command line.  In the short-option string, o, s, I, m and k
# take an argument; h and q are plain flags.
# NOTE(review): this view is missing several lines of this region (part
# of the long-option list, the end of the usage string and the "def" line
# of help, whose body is the final sys.stdout.write line) -- confirm
# against the complete file before editing.
37 options_list, files = getopt.getopt (sys.argv[1:],'o:s:hI:m:k:q',
41 'known-missing-files=',
# Usage text; the option loop prints it via "help_text % vars ()", which
# is what fills in %(program_name)s.
44 help_text = r"""Usage: %(program_name)s [OPTIONS]... TEXIFILE...
45 Extract files names for texinfo (sub)sections from the texinfo files.
48 -h, --help print this help
49 -I, --include=DIRECTORY append DIRECTORY to include search path
50 -m, --master-map-file=FILE use FILE as master map file
51 -o, --output=DIRECTORY write .xref-map files to DIRECTORY
52 -s, --split=MODE split manual according to MODE. Possible values
53 are section and custom (default)
54 -k, --known-missing-files a filename which has a list of files known
55 to be missing for this make
56 -q, --quiet suppress most messages
60 sys.stdout.write ( text)
# Names of include files that may legitimately be absent; filled from the
# file given with -k/--known-missing-files once the options are parsed.
known_missing_files = []
known_missing_files_file = ''
# Last path components for which a non-directory -I argument is not
# reported.
docs_without_directories = ['changes', 'music-glossary']
suppress_output = False

# Apply each (option, argument) pair returned by getopt.
for opt in options_list:
    o = opt[0]
    a = opt[1]
    if o == '-h' or o == '--help':
        help (help_text % vars ())
    if o == '-I' or o == '--include':
        if os.path.isdir (a):
            include_path.append (a)
        else:
            path_list = a.split ('/')
            file_name = path_list[len (path_list) - 1]
            if not (file_name in docs_without_directories):
                sys.stdout.write (a + ' is not a directory.' + '\n')
                sys.stdout.write ('Please consider adding it to the list of ' + '\n')
                sys.stdout.write ('known missing files in extract_texi_filename.py.' + '\n')
    elif o == '-o' or o == '--output':
        outdir = a
    elif o == '-s' or o == '--split':
        split = a
    elif o == '-m' or o == '--master-map-file':
        # A master map that is not a regular file is silently ignored.
        if os.path.isfile (a):
            master_map_file = a
    elif o == '-k' or o == '--known-missing-files':
        # '-k' is accepted by the getopt short-option string and listed in
        # the usage text, so it has to be handled here as well; otherwise
        # it ends up in the 'unknown option' branch below.
        if os.path.isfile (a):
            known_missing_files_file = a
        else:
            sys.stdout.write ('Missing files list file not found: ' + ' ' + a + '\n')
    elif o == '-q' or o == '--quiet':
        suppress_output = True
    else:
        raise Exception ('unknown option: ' + o)
# Load the list of known-missing include files, one name per line.
if known_missing_files_file:
    missing_files = open (known_missing_files_file, 'r')
    try:
        known_missing_files = missing_files.read ().splitlines ()
    finally:
        # Release the handle even when reading fails.
        missing_files.close ()
# Entered only when outdir is not an existing directory; the inner test
# singles out a path that exists but is something other than a directory.
# NOTE(review): the statements guarded by these two conditions are not
# present in this view -- confirm against the complete file.
109 if not os.path.isdir (outdir):
110 if os.path.exists (outdir):
# Only look at @include if it is not preceded by a @c:
# group 1 is the included file name (ending in .texi, .tely, .itexi or
# .itely); names whose third to eighth characters are "/lily-" are skipped.
include_re = re.compile (
    r'^(?!.*@c .*@include)@include ((?!../lily-).*?\.i?te(xi|ly))$', re.M)
# Any run of whitespace.
whitespaces = re.compile (r'\s+')
# A node/sectioning/translation command at the start of a line:
# group 1 is the command name (without '@'), group 2 the rest of the line.
# Written as adjacent raw strings so the pattern does not depend on
# backslash-newline continuation inside a string literal.
section_translation_re = re.compile (
    r'^@(node|(?:unnumbered|appendix)'
    r'(?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|'
    r'(?:major|chap|(?:sub){0,2})heading|lydoctitle|translationof|nodeprefix) '
    r'(.+)$', re.MULTILINE)
# Trailing "@c external ..." marker after a @translationof argument.
external_node_re = re.compile (r'\s+@c\s+external.*')
def expand_includes (m, filename):
    """Return the section list of the file named by an @include match.

    m is a match of include_re (group 1 is the included file name) and
    filename is the file containing the @include.  The included file is
    looked up next to filename first, then in each directory of
    include_path.  When it cannot be found, a warning is printed unless
    the name is listed in known_missing_files, and '' is returned so the
    @include line is simply dropped by the surrounding substitution.
    """
    include_name = m.group (1)
    filepath = os.path.join (os.path.dirname (filename), include_name)
    if os.path.exists (filepath):
        return extract_sections (filepath)[1]

    for directory in include_path:
        filepath = os.path.join (directory, include_name)
        if os.path.exists (filepath):
            return extract_sections (filepath)[1]

    if not (include_name in known_missing_files):
        sys.stdout.write ('Warning: No such file: ' + include_name +
                          ' (search path: ' + ':'.join (include_path)+')' + '\n')
    return ''
lang_re = re.compile (r'^@documentlanguage (.+)', re.M)

def extract_sections (filename):
    """Read filename and return (lang_suffix, sections).

    lang_suffix is '.' followed by the argument of the first
    @documentlanguage line when that argument is not 'en', else ''.
    sections is a string with one "@command argument" line for every
    match of section_translation_re, after each @include matched by
    include_re has been replaced by the sections of the included file.
    """
    f = open (filename, 'r')
    try:
        page = f.read ()
    finally:
        # Close the file even when reading fails.
        f.close ()

    # Search document language
    m = lang_re.search (page)
    if m and m.group (1) != 'en':
        lang_suffix = '.' + m.group (1)
    else:
        lang_suffix = ''

    # Replace all includes by their list of sections and extract all sections
    page = include_re.sub (lambda m: expand_includes (m, filename), page)
    sections = section_translation_re.findall (page)
    result = ''.join (["@" + sec[0] + " " + sec[1] + "\n" for sec in sections])
    return (lang_suffix, result)
# Convert a given node name to its proper file name (normalization as
# explained in the texinfo manual:
# http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
def texinfo_file_name(title):
    """Return the normalized file name / anchor for the node name title.

    'Top' maps to 'index'.  Otherwise whitespace is trimmed and collapsed,
    spaces become '-', ASCII letters and digits are kept, and every other
    character becomes _xxxx (or __xxxxxx above 0xFFFF) with its hex code.
    A result starting with a digit is prefixed with 't_g'.
    """
    # exception: The top node is always mapped to index.html
    if title == 'Top':
        return 'index'
    # File name normalization by texinfo (described in the texinfo manual):
    # 1/2: letters and numbers are left unchanged
    # 3/4: multiple, leading and trailing whitespace is removed
    title = ' '.join (title.split ())
    # 5: all remaining spaces are converted to '-'
    # 6: all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
    pieces = []
    for char in title:
        if char == ' ': # space -> '-'
            pieces.append ('-')
        elif (('0' <= char <= '9') or
              ('A' <= char <= 'Z') or
              ('a' <= char <= 'z')): # number or letter
            pieces.append (char)
        else:
            ccode = ord (char)
            if ccode <= 0xFFFF:
                pieces.append ("_%04x" % ccode)
            else:
                pieces.append ("__%06x" % ccode)
    result = ''.join (pieces)
    # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
    # The comparison includes '9'; a half-open range ending at ord('9')
    # would leave names starting with 9 without the prefix.
    if result and '0' <= result[0] <= '9':
        result = 't_g' + result
    return result
# An @command{...} group; group 1 is the text between the braces.
texinfo_re = re.compile (r'@.*?{(.*?)}')

def remove_texinfo (title):
    """Return title as plain text.

    '--' is replaced by '-', each @command{text} is replaced by text,
    and surrounding whitespace is stripped.
    """
    dashed = title.replace ('--', '-')
    unwrapped = texinfo_re.sub (lambda match: match.group (1), dashed)
    return unwrapped.strip ()
def create_texinfo_anchor (title):
    """Return the anchor for title: texinfo markup is removed first, then
    the file-name normalization is applied to the plain text."""
    plain_title = remove_texinfo (title)
    return texinfo_file_name (plain_title)
# Commands matching unnumbered_re ("unnumbered..." and "lydoctitle") are
# the ones process_sections treats as unnumbered in 'custom' split mode.
201 unnumbered_re = re.compile (r'unnumbered.+|lydoctitle')
# Maps a sectioning command name to a numeric level.
# NOTE(review): only three entries are present in this view and the
# closing brace is not visible -- confirm against the complete file.
202 file_name_section_level = {
211 'unnumberedsubsec':1,
214 'unnumberedsubsubsec':0,
215 'appendixsubsubsec':0
# When the split mode names a sectioning command, its level becomes
# splitting_level, which process_sections compares section levels against.
# NOTE(review): the fallback for other split modes is not in this view.
217 if split in file_name_section_level:
218 splitting_level = file_name_section_level[split]
# process_sections (filename, lang_suffix, page): write the xref map for
# one manual.
#   page is scanned with section_translation_re; each hit is a
#   (command, argument) pair, used below as sec[0] and sec[1].
#   The map path is outdir joined with the base name of filename (its
#   extension removed), followed by lang_suffix and '.xref-map'.
#   Each record written is "title<TAB>filename<TAB>anchor"; records for
#   the title 'Top' are skipped in the @node branch and at the end.
#   Entries still left in initial_map are appended, tab-joined.
# NOTE(review): this view is missing several lines of the function (the
# open of f, the initial values of some state variables, the loop header
# and a few branch bodies) -- confirm against the complete file.
221 def process_sections (filename, lang_suffix, page):
222 sections = section_translation_re.findall (page)
223 basename = os.path.splitext (os.path.basename (filename))[0]
224 p = os.path.join (outdir, basename) + lang_suffix + '.xref-map'
225 if not suppress_output:
229 node_prefix_title = ''
231 this_filename = 'index'
233 this_unnumbered = False
237 # Write out the cached values to the file and start a new
239 if this_title and this_title != 'Top':
240 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
242 this_title = remove_texinfo (sec[1])
243 this_anchor = create_texinfo_anchor (sec[1])
244 # delete entry from master map file
245 if this_title in initial_map:
246 del initial_map[this_title]
# @translationof: subn strips a trailing "@c external" marker and returns
# the stripped text plus the number of removals (non-zero when marked).
247 elif sec[0] == "translationof":
248 (original_node, external_node) = external_node_re.subn ('', sec[1])
249 original_node = remove_texinfo (original_node)
250 # The following binds the translator to use the
251 # translated node name in cross-references in case
253 if external_node and original_node in initial_map:
254 del initial_map[original_node]
255 anchor = create_texinfo_anchor (sec[1])
256 # If @translationof is used, it gives the original
257 # node name, which we use for the anchor and the file
258 # name (if it is a numbered node)
260 if not this_unnumbered:
261 this_filename = anchor
262 elif original_node in initial_map:
263 this_filename = initial_map[original_node][2]
# @nodeprefix: remembered and later prepended to @lydoctitle entries.
264 elif sec[0] == "nodeprefix":
265 node_prefix_title = remove_texinfo (sec[1])
266 node_prefix_anchor = create_texinfo_anchor (sec[1])
268 # Some pages might not use a node for every section, so
269 # treat this case here, too: If we already had a section
270 # and encounter another one before the next @node, we
271 # write out the old one and start with the new values
272 if had_section and split != 'node' and this_title:
273 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
274 this_title = remove_texinfo (sec[1])
275 this_anchor = create_texinfo_anchor (sec[1])
278 if sec[0] == "lydoctitle" and node_prefix_title:
279 this_title = "%s: %s" % (node_prefix_title, this_title)
280 this_anchor = "%s-%s" % (node_prefix_anchor, this_anchor)
# Choose the output file name according to the split mode.
282 if split == 'custom':
283 # unnumbered nodes use the previously used file name,
284 # only numbered nodes get their own filename! However,
285 # top-level @unnumbered still get their own file.
286 this_unnumbered = unnumbered_re.match (sec[0])
287 if not this_unnumbered:
288 this_filename = this_anchor
289 elif split == 'node':
290 this_filename = this_anchor
292 if sec[0] in file_name_section_level and \
293 file_name_section_level[sec[0]] >= splitting_level:
294 this_filename = this_anchor
# Flush the last cached record.
296 if this_title and this_title != 'Top':
297 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
# Append the master-map entries that were not deleted above.
299 for node in initial_map:
300 f.write ("\t".join (initial_map[node]) + "\n")
# One master-map record: NODE<TAB>FILENAME<TAB>ANCHOR.
xref_map_line_re = re.compile (r'(.*?)\t(.*?)\t(.*?)$')
# Load the master map (if one was given) into initial_map, keyed by node
# name; lines that do not look like a record are ignored.
if master_map_file:
    master_map = open (master_map_file)
    try:
        for line in master_map:
            m = xref_map_line_re.match (line)
            if m:
                initial_map[m.group (1)] = (m.group (1), m.group (2), m.group (3))
    finally:
        # Close the handle explicitly instead of leaving it to the
        # garbage collector.
        master_map.close ()
# Build one .xref-map file for every texinfo file on the command line.
for filename in files:
    if not suppress_output:
        sys.stdout.write (
            ("extract_texi_filenames.py: Processing %s" % filename) + "\n")
    lang_suffix, sections = extract_sections (filename)
    process_sections (filename, lang_suffix, sections)