2 # -*- coding: utf-8 -*-
3 # extract_texi_filenames.py
5 # USAGE: extract_texi_filenames.py [-h] [-I DIR] [-o OUTDIR] [-s MODE] FILES
7 # -o OUTDIR specifies that the output files are written to OUTDIR
10 # This script parses the given .texi files and, for each one, creates a file
11 # with the nodename <=> filename/anchor map.
12 # The idea behind it: unnumbered subsections go into the same file as the
13 # previous numbered section; @translationof gives the original node name,
14 # which is then used for the filename/anchor.
16 # If this script is run on a file texifile.texi, it produces a file
17 # texifile[.LANG].xref-map with tab-separated entries of the form
18 # NODE\tFILENAME\tANCHOR
19 # LANG is the document language in case it's not 'en'
20 # Note: The filename does not have any extension appended!
21 # This file can then be used by our texi2html init script to determine
22 # the correct file name and anchor for external refs
# Command-line handling.
# NOTE(review): several short lines of this part of the script (the header of
# the help function, the end of the help string, the bodies of some option
# branches) are not visible in this view; the comments below describe only
# what the visible lines show.
#
# getopt spec: -o, -s and -I take an argument, -h does not; the long forms
# are --output=, --split=, --include= and --help.  Arguments that getopt does
# not consume end up in `files`.
29 options_list, files = getopt.getopt (sys.argv[1:],'o:s:hI:',
30 ['output=', 'split=', 'help', 'include='])
# Usage message; the %(program_name)s placeholder is filled in from vars ()
# where the help option is handled below.
32 help_text = r"""Usage: %(program_name)s [OPTIONS]... TEXIFILE...
33 Extract files names for texinfo (sub)sections from the texinfo files.
36 -h, --help print this help
37 -I, --include=DIRECTORY append DIRECTORY to include search path
38 -o, --output=DIRECTORY write .xref-map files to DIRECTORY
39 -s, --split=MODE split manual according to MODE. Possible values
40 are section and custom (default)
44 sys.stdout.write ( text)
# Apply the parsed options one by one: -h/--help formats and passes the help
# text to help (), -I/--include appends its argument to include_path, and an
# option that matches none of the branches raises an Exception.
50 for opt in options_list:
53 if o == '-h' or o == '--help':
54 help (help_text % vars ())
55 if o == '-I' or o == '--include':
57 include_path.append (a)
58 elif o == '-o' or o == '--output':
60 elif o == '-s' or o == '--split':
63 raise Exception ('unknown option: ' + o)
# Output directory check: distinguishes "outdir is not a directory" from
# "outdir exists as something else".
# NOTE(review): the actions taken in both cases are not visible here.
66 if not os.path.isdir (outdir):
67 if os.path.exists (outdir):
# Matches an "@include FILE" line whose FILE ends in .texi or .itexi and
# captures FILE.  Includes whose path starts with the literal "../lily-" are
# skipped; the dots are escaped so that only a real "../" prefix is excluded
# (unescaped, any two characters in front of "/lily-" would be excluded too).
include_re = re.compile (r'@include ((?!\.\./lily-).*?\.i?texi)$', re.M)

# One or more whitespace characters.
whitespaces = re.compile (r'\s+')

# Matches a line that starts with one of the sectioning commands, @node,
# @translationof or @lydoctitle.  Group 1 is the command name without the
# "@", group 2 the rest of the line without trailing whitespace.
section_translation_re = re.compile (
    r'^@(node|(?:unnumbered|appendix)(?:(?:sub){0,2}sec)?|top|chapter'
    r'|(?:sub){0,2}section|(?:major|chap|(?:sub){0,2})heading'
    r'|translationof|lydoctitle) (.*?)\s*$',
    re.MULTILINE)
def expand_includes (m, filename):
    """Return the section list of the file named by an @include match.

    m is a match object whose group 1 is the include file name as written
    in the document; filename is the path of the including file.  The
    include is looked up next to the including file first and then in each
    directory of include_path, in order.

    Returns the second element of extract_sections () for the first
    candidate that exists.  If none exists, a message naming the last
    candidate tried is written to standard output and an empty string is
    returned, so the result can always be used as a replacement string.
    """
    include_name = m.group (1)
    search_dirs = [os.path.dirname (filename)] + list (include_path)
    for directory in search_dirs:
        filepath = os.path.join (directory, include_name)
        if os.path.exists (filepath):
            return extract_sections (filepath)[1]
    # sys.stdout.write instead of the print statement: same output, but valid
    # syntax for both Python 2 and Python 3.
    sys.stdout.write ("Unable to locate include file " + filepath + "\n")
    return ""
# Matches the "@documentlanguage LANG" line and captures the rest of it.
lang_re = re.compile (r'^@documentlanguage (.+)', re.M)

def extract_sections (filename):
    """Read a texinfo file and list the section commands it contains.

    Returns a tuple (lang_suffix, result):
      lang_suffix -- '.' + LANG if the file has an @documentlanguage line
                     with a language other than 'en', otherwise ''.
      result      -- one "@COMMAND TITLE" line per match of
                     section_translation_re, in document order, after every
                     @include matched by include_re has been replaced by
                     the section list of the included file.
    """
    # The context manager closes the file even if reading fails.
    with open (filename, 'r') as f:
        page = f.read ()

    # Search document language.  The captured text is stripped so that a
    # trailing blank or carriage return cannot leak into the suffix.
    lang_match = lang_re.search (page)
    language = lang_match.group (1).strip () if lang_match else 'en'
    if language != 'en':
        lang_suffix = '.' + language
    else:
        lang_suffix = ''

    # Replace all includes by their list of sections and extract all sections
    page = include_re.sub (lambda inc: expand_includes (inc, filename), page)
    sections = section_translation_re.findall (page)
    result = ''.join (["@" + sec[0] + " " + sec[1] + "\n" for sec in sections])
    return (lang_suffix, result)
109 # Convert a given node name to its proper file name (normalization as explained
110 # in the texinfo manual:
111 # http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
def texinfo_file_name(title):
    """Return the normalized file name / anchor for a node title.

    The title 'Top' always maps to 'index'.  Otherwise:
      1/2: ASCII letters and digits are left unchanged
      3/4: leading and trailing whitespace is removed and runs of
           whitespace are collapsed to a single space
      5:   every remaining space becomes '-'
      6:   every other character becomes _xxxx (four hex digits of its
           character code) or, for codes above 0xFFFF, __xxxxxx
      7:   if the result begins with a digit, 't_g' is prepended so that
           it starts with a letter

    Returns the normalized name; an empty or all-whitespace title gives ''.
    """
    # exception: The top node is always mapped to index.html
    if title == 'Top':
        return 'index'

    # 3/4: split () without arguments drops leading/trailing whitespace and
    # treats every whitespace run as one separator.
    title = ' '.join (title.split ())

    pieces = []
    for char in title:
        if char == ' ':  # 5: space -> '-'
            pieces.append ('-')
        elif ('0' <= char <= '9') or ('A' <= char <= 'Z') or ('a' <= char <= 'z'):
            pieces.append (char)  # 1/2: number or letter
        else:  # 6: everything else is encoded by its character code
            ccode = ord (char)
            if ccode <= 0xFFFF:
                pieces.append ("_%04x" % ccode)
            else:
                pieces.append ("__%06x" % ccode)
    result = ''.join (pieces)

    # 7: if name begins with number, prepend 't_g' (so it starts with a
    # letter).  The comparison is inclusive so that a leading '9' counts as
    # a digit as well.
    if result and '0' <= result[0] <= '9':
        result = 't_g' + result
    return result
# Matches one innermost Texinfo command with a brace argument, such as
# "@code{foo}", and captures the argument.  The argument may not contain
# braces itself, so nested commands are peeled off from the inside out.
texinfo_re = re.compile (r'@[^\s{}@]+\{([^{}]*)\}')

def remove_texinfo (title):
    """Return title with every "@command{argument}" replaced by its argument.

    All commands in the title are handled, including several commands in one
    title ("@code{a} and @var{b}" -> "a and b") and nested ones
    ("@code{@var{x}}" -> "x").  Text without brace commands is returned
    unchanged.
    """
    previous = None
    # Each pass unwraps the innermost commands; repeat until nothing changes.
    # Every substitution shortens the string, so the loop terminates.
    while previous != title:
        previous = title
        title = texinfo_re.sub (r'\1', title)
    return title
def create_texinfo_anchor (title):
    """Return the anchor name for a title.

    The title is first passed through remove_texinfo () and the result is
    then normalized by texinfo_file_name ().
    """
    plain_title = remove_texinfo (title)
    return texinfo_file_name (plain_title)
# Matches the section commands that are treated as "unnumbered" in 'custom'
# split mode (see its use in process_sections): lydoctitle and every command
# name that starts with "unnumbered" followed by at least one more character.
# A plain "unnumbered" does not match.
150 unnumbered_re = re.compile (r'unnumbered.+|lydoctitle')
# Maps the name of a sectioning command to a numeric level; the visible
# entries give the sub-subsection commands level 0 and unnumberedsubsec
# level 1.
# NOTE(review): the remaining entries and the closing brace of this dict are
# not visible in this view.
151 file_name_section_level = {
160 'unnumberedsubsec':1,
163 'unnumberedsubsubsec':0,
164 'appendixsubsubsec':0
# If the split mode names a command listed in the table, remember its level
# in splitting_level for the comparison in process_sections.
# NOTE(review): the value used when split is not in the table is not visible
# here.
166 if split in file_name_section_level:
167 splitting_level = file_name_section_level[split]
# Write the xref map for one texinfo file.
#   filename    - path of the processed file; its base name without the
#                 extension names the output file
#   lang_suffix - text inserted between that base name and '.xref-map'
#   page        - text that is scanned with section_translation_re
# The output path is built from outdir, the base name, lang_suffix and
# '.xref-map'.  Every entry is written as TITLE<TAB>FILENAME<TAB>ANCHOR.
# The writes guarded by "this_title != 'Top'" skip the Top node.
# NOTE(review): several short statements of this function are not visible in
# this view (for instance where f is opened, where this_title, this_anchor
# and had_section are initialised, and the loop header that binds sec), so
# the comments below describe only the visible lines.
170 def process_sections (filename, lang_suffix, page):
171 sections = section_translation_re.findall (page)
172 basename = os.path.splitext (os.path.basename (filename))[0]
173 p = os.path.join (outdir, basename) + lang_suffix + '.xref-map'
# Running state: the file name starts out as 'index', and nothing has been
# marked as unnumbered yet.
177 this_filename = 'index'
179 this_unnumbered = False
183 # Write out the cached values to the file and start a new section:
184 if this_title and this_title != 'Top':
185 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
# The title is stored with texinfo commands removed; the anchor is the
# normalized form of the same text.
187 this_title = remove_texinfo (sec[1])
188 this_anchor = create_texinfo_anchor (sec[1])
189 elif sec[0] == "translationof":
190 anchor = create_texinfo_anchor (sec[1])
191 # If @translationof is used, it gives the original node name, which
192 # we use for the anchor and the file name (if it is a numbered node)
194 if not this_unnumbered:
195 this_filename = anchor
197 # Some pages might not use a node for every section, so treat this
198 # case here, too: If we already had a section and encounter another
199 # one before the next @node, we write out the old one and start
200 # with the new values
201 if had_section and this_title:
202 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
203 this_title = remove_texinfo (sec[1])
204 this_anchor = create_texinfo_anchor (sec[1])
# Decide whether this section starts a new output file, depending on the
# split mode.
207 if split == 'custom':
208 # unnumbered nodes use the previously used file name, only numbered
209 # nodes get their own filename! However, top-level @unnumbered
210 # still get their own file.
211 this_unnumbered = unnumbered_re.match (sec[0])
212 if not this_unnumbered:
213 this_filename = this_anchor
# NOTE(review): the help text lists only 'section' and 'custom' as split
# modes, yet 'node' is handled here - confirm which values are supported.
214 elif split == 'node':
215 this_filename = this_anchor
# A command listed in file_name_section_level starts a new file when its
# level is at least splitting_level.
217 if sec[0] in file_name_section_level and \
218 file_name_section_level[sec[0]] >= splitting_level:
219 this_filename = this_anchor
# Write the entry that is still cached, again skipping the Top node.
221 if this_title and this_title != 'Top':
222 f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
# Script driver: for every file given on the command line, collect its
# section list (with includes expanded) and write the matching xref map.
for filename in files:
    # sys.stdout.write instead of the print statement: same output, but valid
    # syntax for both Python 2 and Python 3.
    sys.stdout.write ("extract_texi_filenames.py: Processing %s\n" % filename)
    (lang_suffix, sections) = extract_sections (filename)
    process_sections (filename, lang_suffix, sections)