#! /usr/bin/python
"""Write a Devhelp-style book (XML on stdout) for a Python HTML doc tree.

Usage: SCRIPT BASEDIR INDEX_FILE VERSION

BASEDIR is the root of the HTML tree, INDEX_FILE the start page relative to
BASEDIR, and VERSION the string substituted into the book header.  The
chapter list is built from links reachable from INDEX_FILE; the function
list is built from ``genindex-all.html`` in BASEDIR.

This is Python 2 code: it depends on ``htmllib`` and ``formatter``.
"""

import formatter
import htmllib
import os
import re
import sys

# NOTE(review): the markup literals below were empty or reduced to a bare
# '%s' in the file as received, so every format operation with more than one
# argument raised TypeError.  They are rebuilt here so that each template has
# exactly the placeholders its argument tuple supplies, using the element
# names of the Devhelp book format.  Confirm them (and the indent unit, which
# is kept as a single space) against a known-good generated book.
_XML_DECL = '<?xml version="1.0"?>'
_BOOK_OPEN = ('<book title="Python %s Documentation" name="Python" '
              'version="%s" link="index.html">')
_SUB_OPEN = '%s<sub link="%s" name="%s">'
_SUB_CLOSE = '%s</sub>'
_FUNCTION = '%s<function link="%s" name="%s"/>'

# A space followed by a parenthesised run of word characters, '-', '.' and
# whitespace; removed from index text before it is remembered as a prefix.
_PAREN_NOTE_RE = re.compile(r' \([\w\-\.\s]+\)')

# Links whose href starts with one of these are never emitted or followed.
_SKIPPED_PREFIXES = ('..', 'http:', 'mailto:', 'news:')

# Pages that are never emitted or followed.
_SKIPPED_PAGES = ('', 'about.html', 'modindex.html', 'genindex.html',
                  'glossary.html', 'search.html', 'contents.html',
                  'download.html', 'bugs.html', 'license.html',
                  'copyright.html')


def _link_text(data):
    """Return *data* without tabs, on one line, escaped for XML output."""
    text = data.replace('\t', '').replace('\n', ' ')
    # '&' must be escaped first so the entities added below stay intact.
    return (text.replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;'))


class PyHTMLParser(htmllib.HTMLParser):
    """Print nested ``<sub>`` entries for the links found in one HTML page.

    Link data is collected from ``<a>`` tags.  Pages whose href is listed in
    ``pages_to_include`` are parsed recursively by a nested parser that uses
    a base indent one level deeper.
    """

    pages_to_include = set(('whatsnew/index.html', 'tutorial/index.html',
                            'using/index.html', 'reference/index.html',
                            'library/index.html', 'howto/index.html',
                            'extending/index.html', 'c-api/index.html',
                            'install/index.html', 'distutils/index.html',
                            'documenting/index.html'))

    def __init__(self, formatter, basedir, fn, indent, parents=None):
        """Initialise parsing state for page *fn* below *basedir*.

        formatter -- handed to ``htmllib.HTMLParser.__init__``
        basedir   -- directory that page paths are relative to
        fn        -- page path; split into ``self.dir`` and ``self.fn``
        indent    -- base nesting depth used when indenting output
        parents   -- set of pages currently being parsed; nested parsers
                     receive the same set object.  A fresh set is created
                     when omitted (a ``set()`` default would be shared by
                     every parser constructed without the argument).
        """
        htmllib.HTMLParser.__init__(self, formatter)
        self.basedir = basedir
        self.dir, self.fn = os.path.split(fn)
        self.data = ''
        # Keep the caller's set object when one is given: parse_file relies
        # on parent and child parsers seeing the same set.
        self.parents = set() if parents is None else parents
        self.link = {}
        self.indent = indent
        self.last_indent = indent - 1
        self.sub_indent = 0
        self.sub_count = 0
        self.next_link = False

    def process_link(self):
        """Print an opening ``<sub>`` element for the current link.

        If the previous entry was printed at the same depth, it is closed
        first.
        """
        new_href = os.path.join(self.dir, self.link['href'])
        text = self.link['text']
        indent = self.indent + self.sub_indent
        if self.last_indent == indent:
            print(_SUB_CLOSE % (' ' * self.last_indent))
            self.sub_count -= 1
        print(_SUB_OPEN % (' ' * indent, new_href, text))
        self.sub_count += 1
        self.last_indent = indent

    def start_li(self, attrs):
        """Enter a list item: go one level deeper and arm ``next_link``."""
        self.sub_indent += 1
        self.next_link = True

    def end_li(self):
        """Leave a list item, closing one open ``<sub>`` if any remain."""
        if self.sub_count > 0:
            print(_SUB_CLOSE % (' ' * self.last_indent))
            self.sub_count -= 1
        self.last_indent -= 1
        self.sub_indent -= 1

    def start_a(self, attrs):
        """Record the attributes of an opening ``<a>`` and reset the text."""
        self.link = {}
        for attr in attrs:
            self.link[attr[0]] = attr[1]
        self.data = ''

    def end_a(self):
        """Decide whether the link just closed is emitted and/or followed."""
        self.link['text'] = _link_text(self.data)

        # An <a> tag without an href attribute has nothing to link to.
        if 'href' not in self.link:
            return
        href = self.link['href']

        # Do not revisit a page that is currently being parsed.
        if os.path.join(self.basedir, href) in self.parents:
            return
        if href.startswith(_SKIPPED_PREFIXES):
            return
        if href in _SKIPPED_PAGES:
            return

        # The class is compared for equality.  The former
        # "in ('biglink')" tested against a parenthesised string, i.e. it
        # was a substring test rather than tuple membership.
        process = False
        link_class = self.link.get('class')
        if link_class == 'biglink':
            process = True
        elif link_class == 'reference external' and self.next_link:
            process = True
            # Must be the attribute: assigning a local here left the flag
            # set, so it never limited output to one link per list item.
            self.next_link = False

        if process:
            self.process_link()
        if href in self.pages_to_include:
            self.parse_file(os.path.join(self.dir, href))

    def finish(self):
        """Print a closing ``</sub>`` if ``sub_count`` is still positive."""
        if self.sub_count > 0:
            print(_SUB_CLOSE % (' ' * self.last_indent))

    def handle_data(self, data):
        """Accumulate character data for the link currently being read."""
        self.data += data

    def parse_file(self, href):
        """Parse page *href* (relative to basedir) with a nested parser.

        This parser's own page is kept in ``self.parents`` while the nested
        parser runs, so links back to it are skipped by ``end_a``.
        """
        # TODO: determine basedir
        parent = os.path.join(self.basedir, self.fn)
        self.parents.add(parent)
        parser = PyHTMLParser(formatter.NullFormatter(), self.basedir, href,
                              self.indent + 1, self.parents)
        # Close the file promptly; recursion can otherwise hold many open.
        with open(self.basedir + '/' + href) as page:
            text = page.read()
        parser.feed(text)
        parser.finish()
        parser.close()
        self.parents.discard(parent)


class PyIdxHTMLParser(htmllib.HTMLParser):
    """Print one ``<function>`` entry per link of an index page.

    Output is produced only while ``self.active`` is set.  ``start_h2``
    sets it at the heading whose id is ``'_'``; ``end_table`` clears it
    once a table ends while the current heading id equals
    ``self.last_letter``.
    """

    def __init__(self, formatter, basedir, fn, indent):
        """Initialise parsing state for index page *fn* below *basedir*.

        formatter -- handed to ``htmllib.HTMLParser.__init__``
        basedir   -- directory that page paths are relative to
        fn        -- page path; split into ``self.dir`` and ``self.fn``
        indent    -- depth used when indenting output lines
        """
        htmllib.HTMLParser.__init__(self, formatter)
        self.basedir = basedir
        self.dir, self.fn = os.path.split(fn)
        self.data = ''
        self.link = {}
        self.indent = indent
        self.active = False
        self.indented = False
        self.nolink = False
        self.header = ''
        self.last_letter = 'Z'
        self.last_text = ''

    def process_link(self):
        """Print a ``<function>`` element for the current link.

        Nothing is printed while inactive, for text starting with ``'['``,
        or for links whose ``rel`` attribute is a navigation relation.
        Inside a nested ``<dl>`` the text is prefixed with the remembered
        text of the enclosing entry.
        """
        new_href = os.path.join(self.dir, self.link['href'])
        text = self.link['text']
        if not self.active:
            return
        if text.startswith('['):
            return
        if self.link.get('rel', None) in ('prev', 'parent', 'next',
                                          'contents', 'index'):
            return
        if self.indented:
            text = self.last_text + ' ' + text
        else:
            # Save it in case we need it again
            self.last_text = _PAREN_NOTE_RE.sub('', text)
        print(_FUNCTION % (' ' * self.indent, new_href, text))

    def start_dl(self, attrs):
        """Mark following links as sub-entries if a prefix is remembered."""
        if self.last_text:
            # Looks like we found the second part to a command
            self.indented = True

    def end_dl(self):
        """Leave sub-entry mode."""
        self.indented = False

    def start_dt(self, attrs):
        """Reset collected text and assume the term holds no link."""
        self.data = ''
        self.nolink = True

    def end_dt(self):
        """Remember the text of a link-less term as the next prefix."""
        if not self.active:
            return
        if self.nolink:
            # Looks like we found the first part to a command
            self.last_text = _PAREN_NOTE_RE.sub('', self.data)
            self.nolink = False

    def start_h2(self, attrs):
        """Track the heading id; the heading with id '_' enables output."""
        for k, v in attrs:
            if k == 'id':
                self.header = v
                if v == '_':
                    self.active = True

    def start_td(self, attrs):
        """Forget sub-entry state at the start of a table cell."""
        self.indented = False
        self.last_text = ''

    def start_table(self, attrs):
        """Ignore table starts."""
        pass

    def end_table(self):
        """Disable output after the table under the last heading."""
        if self.header == self.last_letter:
            self.active = False

    def start_a(self, attrs):
        """Record the attributes of an opening ``<a>`` and reset the text."""
        self.nolink = False
        self.link = {}
        for attr in attrs:
            self.link[attr[0]] = attr[1]
        self.data = ''

    def end_a(self):
        """Emit the link just closed, provided it has an href."""
        self.link['text'] = _link_text(self.data)
        # An <a> tag without an href attribute has nothing to link to.
        if 'href' not in self.link:
            return
        self.process_link()

    def handle_data(self, data):
        """Accumulate character data for the element being read."""
        self.data += data


def main():
    """Print the complete book for the tree named on the command line."""
    if len(sys.argv) < 4:
        sys.exit('usage: %s BASEDIR INDEX_FILE VERSION' % sys.argv[0])
    base = sys.argv[1]
    fn = sys.argv[2]
    version = sys.argv[3]

    parser = PyHTMLParser(formatter.NullFormatter(), base, fn, indent=0)
    print(_XML_DECL)
    print(_BOOK_OPEN % (version, version))
    print('<chapters>')
    parser.parse_file(fn)
    print('</chapters>')

    print('<functions>')
    fn = 'genindex-all.html'
    parser = PyIdxHTMLParser(formatter.NullFormatter(), base, fn, indent=1)
    with open(base + '/' + fn) as index_page:
        text = index_page.read()
    parser.feed(text)
    parser.close()
    print('</functions>')
    print('</book>')


main()