1 | #!/usr/bin/python |
---|
2 | # |
---|
3 | # makeman -- compile netpbm's stereotyped HTML to troff markup |
---|
4 | # |
---|
5 | # This approach works because we control the entire document universe |
---|
6 | # this is going to convert and can reinforce useful stereotypes. |
---|
7 | # |
---|
8 | # The output of this tool uses cliches parseable by doclifter, |
---|
9 | # which should thus be able to recover all the semantic information |
---|
10 | # it looks like this thing is losing. |
---|
11 | # |
---|
12 | # Known bugs: |
---|
13 | # * Ordered lists are smashed into unordered lists |
---|
14 | # |
---|
15 | # Limitations: |
---|
16 | # * IMG tags are issued as .IMG preceded by a bolded caption containing |
---|
17 | # the alt content. This will only work if the page is formatted with |
---|
18 | # mwww macros. |
---|
19 | # * Loses summary information from tables. |
---|
20 | # * Only permits one <HR> in the HTML, right before the index. |
---|
21 | # |
---|
22 | # You can use the <?makeman ?> PI to pass text directly through to the |
---|
23 | # generated manual page. A major use is to insert format lines for tables. |
---|
24 | # |
---|
25 | # By Eric S. Raymond <esr@thyrsus.com> |
---|
26 | # Version 1.0, July 26 2004 |
---|
27 | # |
---|
28 | # Modified by Akira F. Urushibata <afu@wta.att.ne.jp> |
---|
29 | # Version 1.1, February 11 2016 |
---|
30 | # |
---|
31 | # Added ability to process &mdash; &minus; |
---|
32 | # Added footer message to clarify original source. |
---|
33 | # |
---|
34 | |
---|
35 | import os, sys, exceptions, re |
---|
36 | |
---|
# Text placed in the fourth field of the .TH line that makeman() generates.
source = "netpbm documentation"
# Manual section number; makeman() declares this global and reassigns it
# for every page it converts.
section = 1

# Comment block that makeman() prepends to every generated page.
# Because this is a raw string, the backslash right after the opening
# quotes is NOT a line continuation: the text really starts with a line
# holding a single backslash.
warning = r'''\
.\" This man page was generated by the Netpbm tool 'makeman' from HTML source.
.\" Do not hand-hack it! If you have bug fixes or improvements, please find
.\" the corresponding HTML page on the Netpbm website, generate a patch
.\" against that, and send it to the Netpbm maintainer.
'''

# Start of the closing section appended to every generated page.  It has
# no trailing newline on purpose: makeman() appends the base name of the
# HTML file directly after the URL prefix on the last line.
footerprefix = '''.SH DOCUMENT SOURCE
This manual page was generated by the Netpbm tool 'makeman' from HTML
source. The master documentation is at
.IP
.B http://netpbm.sourceforge.net/doc/'''
---|
52 | |
---|
class LiftException(Exception):
    """Error raised while gathering or converting pages.

    main() catches this, prints `message` and returns `retval` as the
    exit status.

    Attributes:
        message: human-readable description of the problem.
        retval:  exit status main() should return (default 1).
    """

    def __init__(self, message, retval=1):
        # Initialise the base class so str(exc), exc.args and tracebacks
        # carry the message; previously they were empty.
        Exception.__init__(self, message)
        self.message = message
        self.retval = retval
---|
57 | |
---|
def makeman(name, file, indoc):
    """Transform a string representing an HTML document into man markup.

    The conversion is a fixed pipeline of textual substitutions; the
    order of the steps matters, because later steps rely on cookies and
    markup produced by earlier ones.

    Args:
        name:  ignored on entry; it is re-derived from the document's
               <H1> element before it is first read.
        file:  name of the HTML file.  Used in the "Bad lines" diagnostic
               and (base name only) in the footer.
        indoc: the HTML document as a single string.

    Returns:
        The troff/man source as a string: the module-level `warning`
        text, the converted body, then `footerprefix` plus the base name
        of `file` and a trailing ".PP".

    Side effects:
        Reassigns the module-level `section`, reads the module-level
        `sectmap`, and writes a "Bad lines" report to sys.stderr when
        unconverted markup or entities remain.

    NOTE(review): the "<!--no_print-->" rule below appears unable to
    fire, since the earlier generic "<!--[^>]*-->" removal already
    deletes such markers.  The later comment rule captures only the last
    character of a comment.  Both are kept as they were.
    """
    global section, sectmap
    # A dot at the left margin confuses troff.  This program generates
    # such lines itself (as requests), so hide the ones present in the
    # input behind a cookie until the very end.
    indoc = indoc.replace("\n.", "\n@%@%@")
    # Protect escapes before we try generating font changes.
    indoc = indoc.replace("\\", r"\e")

    # Header-bashing
    indoc = re.sub('(?i)<!DOCTYPE html[^>]*>', "", indoc)
    for boilerplate in (
        '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">',
        '<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"/>',
        '<?xml version="1.1" encoding="iso-8859-1" ?>\n',
        '<html xmlns="http://www.w3.org/1999/xhtml">',
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">',
        "<HEAD>", "</HEAD>",
        "<head>", "</head>",
    ):
        indoc = indoc.replace(boilerplate, "")
    indoc = re.sub('(?i)<A HREF="#index">Table Of Contents</A>', "", indoc)

    # The "Updated:" line supplies the date field of .TH.
    datematch = re.compile("Updated: (.*)\n")
    match = datematch.search(indoc)
    date = match.group(1) if match else ""
    indoc = datematch.sub("", indoc)

    # The <H1> text is the key used to look the page up in sectmap.
    namematch = re.compile("<H1>(.*)</H1>", re.I)
    match = namematch.search(indoc)
    name = match.group(1) if match else None

    # An explicit manual_section META tag wins; otherwise fall back to
    # the cross-reference map (0 when the page is not listed there).
    section = 1
    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
    match = meta.search(indoc)
    if match:
        section = int(match.group(1))
        indoc = meta.sub("", indoc)
    else:
        section = sectmap.get(name, 0)
    indoc = namematch.sub("", indoc)
    indoc = re.sub("(?i)<BODY[^>]*>", "", indoc)
    indoc = re.sub("(?i)<HTML>", "", indoc)

    # Remove more superfluous headers
    titlematch = re.compile("<TITLE>(.*)</TITLE>\n+", re.I)
    match = titlematch.search(indoc)
    title = match.group(1) if match else None
    indoc = titlematch.sub("", indoc)
    indoc = re.sub("(?i)\n*<BR>\n+", "\n", indoc)
    indoc = re.sub("(?i)<BR>", "\n", indoc)
    indoc = ('.TH "%s" %d "%s" "%s"\n' % (title, section, date, source)) + indoc

    # Literal layout
    indoc = re.sub("(?i)\n *<PRE>", "\n.nf", indoc)
    indoc = re.sub("(?i)\n *</PRE>", "\n.fi", indoc)
    indoc = re.sub("(?i)\n *<BLOCKQUOTE>", "\n.RS", indoc)
    indoc = re.sub("(?i)\n *</BLOCKQUOTE>", "\n.RE", indoc)

    # Highlight processing: (tag, replacement for <tag>, for </tag>).
    # The replacements are re.sub templates, hence the doubled backslash.
    highlights = (
        ("B", r"\\fB", r"\\fP"),
        ("EM", r"\\fI", r"\\fP"),
        ("CITE", r"\\fI", r"\\fP"),
        ("I", r"\\fI", r"\\fP"),
        ("TT", r"\\f(CW", r"\\fP"),
        ("KBD", r"\\f(CW", r"\\fP"),
        ("CODE", r"\\f(CW", r"\\fP"),
        ("STRONG", r"\\fB", r"\\fP"),
        ("SUP", r"\\u", r"\\d"),
    )
    for tag, opening, closing in highlights:
        indoc = re.sub("(?i)<%s>" % tag, opening, indoc)
        indoc = re.sub("(?i)</%s>" % tag, closing, indoc)

    # Paragraph handling
    indoc = re.sub("(?i)\n*<P>\n*", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)<br */>", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)</P>", "", indoc)
    indoc = re.sub("(?i)<!--[^>]*-->", "", indoc)
    indoc = re.sub("(?i)<meta[^>]*>", "", indoc)

    # Inside lists, paragraph breaks become .sp instead of .PP.  The
    # depth is raised before and lowered after the rewrite, so lines that
    # open or close a list are themselves treated as inside it.
    lines = indoc.split("\n")
    listdepth = 0
    for i, text in enumerate(lines):
        lowered = text.lower()
        if "<dl" in lowered or "<ol" in lowered or "<ul" in lowered:
            listdepth += 1
        if listdepth:
            lines[i] = text.replace(".PP", ".sp")
        if "</dl>" in lowered or "</ol>" in lowered or "</ul>" in lowered:
            listdepth -= 1
    indoc = "\n".join(lines)
    indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)

    # Format email addresses as italic
    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>',
                   r'\\fI\1\\fP', indoc)

    # Format manual crossreferences
    def xrefmatch(match):
        xrefto = match.group(2)
        xrefurl = match.group(1)
        xrefsection = sectmap.get(xrefurl, 1)
        if xrefsection == 0:
            return "\n.I " + xrefto
        else:
            return '\n.BR "%s" (%d)\\c\n\\&' % (xrefto, xrefsection)
    indoc = re.sub(r'(?i)\n* *(?:\\fB)?<A[ \n]+HREF="?([^>]+.html)"?>([^<]+)</A>(?:\\fP)?',
                   xrefmatch, indoc)

    # Format URLs.  An "&lt;" / "&gt;" pair wrapped around the link is
    # swallowed along with it.
    def urlmatch(match):
        url = match.group(1).replace('\n', ' ')
        txt = match.group(2).replace('\n', ' ')
        return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
    indoc = re.sub(r'(?i)\n*(?:&lt;)?<A[ \n]+HREF *= *"([^>]+)">([^<]+)</A>(?:&gt;)?',
                   urlmatch, indoc)

    # Turn some entities into harmless cookies, so that the characters
    # they stand for are not mistaken for leftover markup by the error
    # check below.  The cookies are turned into real characters at the end.
    indoc = indoc.replace("&lt;", "@#!#@").replace("&gt;", "#@!@#").replace("&amp;", "#!@!@!#")
    indoc = indoc.replace("&times;", r"\(mu")
    indoc = indoc.replace("&reg;", r"\*R")
    indoc = indoc.replace("&copy;", r"\(co")

    # Turn anchors into .UN tags
    indoc = re.sub('(?i)<A NAME *= *"#?([a-zA-Z_][a-zA-Z_0-9.-]+)">(?:&nbsp;)*</A>\\s*',
                   ".UN \\1\n", indoc)

    # Strip off the index trailer: everything from the first <HR> on.
    trailer = re.compile('<HR */*>.*', re.DOTALL | re.IGNORECASE)
    indoc = re.sub(trailer, "", indoc)
    # If there was no index trailer, we still need to strip these
    indoc = indoc.replace("</BODY>", "").replace("</HTML>", "")
    indoc = indoc.replace("</body>", "").replace("</html>", "")

    headings = (("H2", ".SH"), ("H3", ".SS"), ("H4", ".B"))
    ident = '([a-zA-Z][_a-zA-Z0-9-]+)'
    # Recognize sections with IDs, first with the ID on a nested <A> ...
    for tag, macro in headings:
        indoc = re.sub('(?i)<%s><A (?:ID|NAME)="%s">([^><]*)</A></%s>' % (tag, ident, tag),
                       ".UN \\1\n%s \\2" % macro, indoc)
    # ... then with the ID on the heading element itself.
    for tag, macro in headings:
        indoc = re.sub('(?i)<%s (?:ID|NAME)="%s">([^><]*)</%s>' % (tag, ident, tag),
                       ".UN \\1\n%s \\2" % macro, indoc)
    # Sections without IDs
    for tag, macro in headings:
        indoc = re.sub('(?i)<%s>([^><]*)</%s>' % (tag, tag),
                       "%s \\1" % macro, indoc)

    # Process definition lists -- just turn them into .TPs
    indoc = re.sub("(?i) *<DL *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *</DL>", "", indoc)
    indoc = re.sub("(?i) *<DT>", ".TP\n", indoc)
    indoc = re.sub("(?i) *</DT>", "", indoc)
    indoc = re.sub("(?i)\n*<DD>\n*", "\n", indoc)
    indoc = re.sub("(?i) *</DD>", "", indoc)
    # Process unordered lists -- just turn them into .TPs
    # (ordered lists get the same bullets; see "Known bugs" in the header)
    indoc = re.sub("(?i)</?[UO]L *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *<LI>", ".IP \\(bu\n", indoc)
    indoc = re.sub("(?i) *</LI>", "", indoc)

    # No-print tags
    indoc = re.sub("<!--no_print-->.*", "", indoc)
    # Passthrough
    indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
    # Comments
    indoc = re.sub("<!--([^\n])*-->", r'.\"\1', indoc)
    # Acronyms
    indoc = re.sub('<acronym [a-zA-Z0-9:= \n"]*>', "", indoc)
    indoc = re.sub("</acronym>", "", indoc)
    # Image tags
    indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"( *[a-z]*="?[0-9]*"?)*>',
                   ".B \\2\n.IMG -C \\1", indoc)

    # Special characters: remaining entities and their troff spelling.
    for entity, roff in (
        ("&quot;", "'"),
        ("&nbsp;", "\\ "),
        ("&minus;", "-"),
        ("&mdash;", "-"),
        ("&mu;", "mu"),
        ("&sigma;", "sigma"),
    ):
        indoc = indoc.replace(entity, roff)

    # Tables
    # This will not handle rowspan
    indoc = re.sub('(?i) *<table[^>]*>.*', ".TS", indoc)
    indoc = re.sub("(?i) *</table>.*", ".TE", indoc)
    # First the single-line case
    indoc = re.sub("(?i)</td> *<td>", "\t", indoc)
    indoc = re.sub("(?i)<tr> *<td>", "", indoc)
    indoc = re.sub("(?i)</td> *</tr>", "", indoc)
    # Then the multiline case
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*', '\t\\1', indoc)
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<]*)</t[dh]>\s*', '\tT{\n\\1T}', indoc)
    indoc = indoc.replace("\n\\&T}", "\nT}")
    indoc = re.sub("(?i) *</tr>", "", indoc)
    indoc = re.sub("(?i) *<tr[^>]*>\t*", "", indoc)
    indoc = re.sub(r"\.TS\s+<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>([^<]*)</[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>\s*",
                   ".B \\1\n.TS\n", indoc)

    # Debugging
    #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))

    # Time for error checking now: any angle bracket or entity that is
    # still present was not understood by the rules above.
    badlines = [
        line for line in indoc.split("\n")
        if "<" in line
        or ">" in line.replace(" >", "")
        or re.search(r'(?<!^\\)&.*;', line)
    ]
    if badlines:
        sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file)
                         + "\n".join(badlines)
                         + "\n-----------------\n")

    # Goes after bad-line check so we don't misinterpret it as an error
    indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">").replace("#!@!@!#", "&")
    indoc = re.sub("\n+$", "\n", indoc)
    # Single-quote at left margin confuses troff.
    # This program never generates these.
    indoc = indoc.replace("\n'", "\n\\&'")
    # Finish guarding against leading dots.
    indoc = indoc.replace("\n@%@%@", "\n\\&.")
    # Mark these generated pages so people won't hand-hack them.
    indoc = warning + indoc
    indoc = indoc + footerprefix + os.path.basename(file) + "\n.PP"
    return indoc
---|
269 | |
---|
def _read_html(dirprefix, filename):
    "Return the text of filename under dirprefix, or None if it can't be opened."
    try:
        infp = open(os.path.join(dirprefix, filename))
    except (IOError, OSError):
        sys.stderr.write("makeman: can't open %s\n" % filename)
        return None
    try:
        return infp.read()
    finally:
        infp.close()


def main(args, mainout=sys.stdout, mainerr=sys.stderr):
    """Convert the HTML files named on the command line to man pages.

    Options:
        -d DIR  look for the HTML input files under DIR
        -v      verbose: name each file as it is converted, and let
                unexpected exceptions propagate with their traceback

    Each input FILE is converted in two passes.  The first pass fills the
    module-level `sectmap` (title and file name -> manual section) so
    that cross-references can be resolved; the second pass runs makeman()
    on each file and writes the result to "<stem>.<section>", where
    <stem> is FILE up to its first dot.

    Args:
        args:    command-line arguments, without the program name.
        mainout: accepted for interface compatibility; not used here.
        mainerr: stream that receives fatal error messages.

    Returns:
        None on success; the LiftException's retval (1 by default) for a
        malformed input page; 3 on an I/O error; 4 on keyboard interrupt;
        5 on an unexpected internal error when not verbose.
    """
    global sectmap
    import getopt
    (options, arguments) = getopt.getopt(args, "vd:")
    dirprefix = ""
    verbosity = 0
    for (switch, val) in options:
        if switch == '-d':      # Set HTML input directory
            dirprefix = val
        elif switch == '-v':    # Enable verbose error reporting
            verbosity += 1
    # Handed to makeman(), which re-derives the name from <H1> itself.
    name = None
    try:
        # First pass: gather locations for crossreferences:
        sectmap = {}
        namere = re.compile("<H1>(.*)</H1>", re.I)
        titlere = re.compile("<TITLE>(.*)</TITLE>", re.I)
        meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
        hr = re.compile("(?i)<HR>")
        for filename in arguments:
            indoc = _read_html(dirprefix, filename)
            if indoc is None:
                continue
            namematch = namere.search(indoc)
            titlematch = titlere.search(indoc)
            if not namematch:
                raise LiftException("name missing from %s" % filename)
            if not titlematch:
                raise LiftException("title missing from %s" % filename)
            title = titlematch.group(1)
            # NOTE(review): the name is taken from <TITLE>, not <H1>,
            # while makeman() looks pages up by their <H1> text.  This
            # may have been meant to be namematch.group(1); left as is
            # because changing it alters the section number in the .TH
            # line of pages without a manual_section META tag.
            name = titlematch.group(1)
            match = meta.search(indoc)
            if match:
                section = int(match.group(1))
            else:
                section = 1
            sectmap[title] = sectmap[filename] = sectmap[name] = section
            # Everything from the first <HR> on is discarded as the index
            # trailer, so a second <HR> means real content would be lost.
            firsthr = hr.search(indoc)
            if firsthr and hr.search(indoc[firsthr.start(0)+4:]):
                # Previously the exception was constructed but not raised.
                raise LiftException("%s has two <HR> tags!" % filename)
        # Second pass: do formatting
        for filename in arguments:
            indoc = _read_html(dirprefix, filename)
            if indoc is None:
                continue
            # Tag the temp file with the program name and PID.  (The page
            # title left over from the first pass used to be used here.)
            tmpname = "%s.~makeman-%d~" % (filename, os.getpid())
            try:
                outfp = open(tmpname, "w")
            except OSError:
                # NOTE(review): under Python 2 open() raises IOError, so
                # there this is handled by the IOError clause below.
                sys.stderr.write("makeman: can't open tempfile\n")
                return True
            try:
                if verbosity:
                    sys.stderr.write("makeman: %s\n" % filename)
                outdoc = makeman(name, filename, indoc)
            except:
                # Clean up, then pass the exception upwards unchanged.
                outfp.close()   # under Windows you can't remove an open file
                os.remove(tmpname)
                raise
            if outdoc is None or outdoc == indoc:
                # Nothing to install; don't leave the temp file behind.
                outfp.close()
                os.remove(tmpname)
                continue
            try:
                outfp.write(outdoc)
            finally:
                outfp.close()   # under Windows you can't rename an open file
            dot = filename.find(".")
            # Without a dot, find() returns -1; keep the whole name then
            # instead of chopping off its last character.
            stem = filename if dot < 0 else filename[:dot]
            os.rename(tmpname, "%s.%d" % (stem, sectmap[filename]))
    except LiftException as e:
        mainerr.write("makeman: " + e.message + "\n")
        return e.retval
    except IOError as e:
        mainerr.write("makeman: file I/O error: %s\n" % e)
        return 3
    except KeyboardInterrupt:
        mainerr.write("makeman: bailing out...\n")
        return 4
    except Exception:
        if verbosity:
            # Let the traceback through for debugging.
            raise
        else:
            mainerr.write("makeman: internal error!\n")
            return 5
---|
363 | |
---|
if __name__ == "__main__":
    # Run the main sequence; main()'s return value becomes the process
    # exit status (None means success).  The call form of raise is
    # equivalent to the old "raise SystemExit, value" statement and is
    # also valid Python 3.
    raise SystemExit(main(sys.argv[1:]))
---|
367 | |
---|
368 | # The following sets edit modes for GNU EMACS |
---|
369 | # Local Variables: |
---|
370 | # mode:python |
---|
371 | # End: |
---|