#!/usr/bin/perl -w # Copyright © 2001 Colin Watson # Copyright © 2008 Jordà Polo # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, you can find it on the World Wide # Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free # Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301, USA. # Should be run from the top level of the Lintian source tree or with # LINTIAN_ROOT set appropriately. You need copies of all the relevant manuals # installed in the standard places locally (packages debian-policy, # developers-reference, doc-base, python, lintian, menu and libpkg-guide). use strict; use warnings; use File::Basename; use POSIX qw(strftime); BEGIN { my $LINTIAN_ROOT = $ENV{'LINTIAN_ROOT'}; if (not $LINTIAN_ROOT) { use Cwd (); $ENV{'LINTIAN_ROOT'} = $LINTIAN_ROOT = Cwd::cwd(); } else { chdir $LINTIAN_ROOT or die "Cannot chdir to $LINTIAN_ROOT: $!\n"; } } my $LINTIAN_ROOT = $ENV{'LINTIAN_ROOT'}; # For each manual, we need: # * Location of the manual index on the local filesystem # * Base URL for the eventual target of the reference (or empty string if no # public URL is available) # * Regex to match the possible references # * Mapping from regex fields to reference fields (array of arrays of # keywords: url, section title; the position of each keyword in the array # defines which is its corresponding group in the regex) # # Optionally, if there are subsections that aren't available in the index, an # additional regex can be defined to match possible references on other pages # of the manual. my $title_re = qr/(.+?)<\/title\s?>/i; my $link_re = qr//; my $index_re = qr/([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)<\/a>/; my $fields = [ [ 'url' ], [ 'section' ], [ 'title' ] ]; my $dbk_index_re = qr/([\d.]+?)\.\s+([\w\s[:punct:]]+?)<\/a\s*>/i; my $dbk_fields = [ [ 'section' ], [ 'url' ], [ 'title' ] ]; my %manuals = ( 'policy' => [ '/usr/share/doc/debian-policy/policy.html/index.html', 'http://www.debian.org/doc/debian-policy/', $link_re, $fields ], 'menu-policy' => [ '/usr/share/doc/debian-policy/menu-policy.html/index.html', 'http://www.debian.org/doc/packaging-manuals/menu-policy/', $link_re, $fields ], 'perl-policy' => [ '/usr/share/doc/debian-policy/perl-policy.html/index.html', 'http://www.debian.org/doc/packaging-manuals/perl-policy/', $link_re, $fields ], 'python-policy' => [ '/usr/share/doc/python/python-policy.html/index.html', 'http://www.debian.org/doc/packaging-manuals/python-policy/', $link_re, $fields ], 'java-policy' => [ '/usr/share/doc/java-common/debian-java-policy/index.html', 'http://www.debian.org/doc/packaging-manuals/java-policy/', $dbk_index_re, $dbk_fields ], 'vim-policy' => [ '/usr/share/doc/vim-doc/vim-policy.html/index.html', 'http://pkg-vim.alioth.debian.org/vim-policy.html/', $dbk_index_re, $dbk_fields ], 'lintian' => [ '/usr/share/doc/lintian/lintian.html/index.html', 'http://lintian.debian.org/manual/', $dbk_index_re, $dbk_fields ], 'devref' => [ '/usr/share/doc/developers-reference/index.html', 'http://www.debian.org/doc/developers-reference/', $index_re, $fields, qr/<\/a>([\d\.]+?)\.? ([\w\s[:punct:]]+?)<\/h[45]>/ ], 'menu' => [ '/usr/share/doc/menu/html/index.html', 'http://www.debian.org/doc/packaging-manuals/menu.html/', $index_re, $fields ], 'doc-base' => [ '/usr/share/doc/doc-base/doc-base.html/index.html', '', $index_re, $fields ], 'debconf-spec' => [ '/usr/share/doc/debian-policy/debconf_specification.html', 'http://www.debian.org/doc/packaging-manuals/debconf_specification.html', qr/([\w\s[:punct:]]+?)<\/a>/, [ [ 'section', 'url' ], [ 'title' ] ] ], 'fhs' => [ '/usr/share/doc/debian-policy/fhs/fhs-2.3.html', 'http://www.pathname.com/fhs/pub/fhs-2.3.html', qr/([\w\s[:punct:]]+?)<\/a\s*>/i, [ [ 'section', 'url' ], [ 'title' ] ] ], ); # extract_refs -- Extract manual references from HTML file. # # This function takes the output file handle, the path to the page, and the # regex to match, and prints references to stdout. The second argument is used # to decide whether to look for the title (0) or not (1). It returns a list of # pages linked by the extracted references. sub extract_refs { my ($fh, $manual, $title, $page, $url, $ref_re, $fields) = @_; my @linked_pages = (); open(PAGE, '<', $page) or die "Couldn't open $page: $!"; # Read until there are 2 newlines. This hack is needed since some lines in # the Developer's Reference are cut in the middle of .... local $/ = "\n\n"; while () { if (not $title and m/$title_re/) { $title = 1; my @out = ( $manual, '', $1, $url ); print $fh join('::', @out) . "\n"; } while (m/$ref_re/g) { my %ref; for (my $i = 0; $i < scalar @{$fields}; $i++) { foreach my $c (@{$fields->[$i]}) { my $v = $i + 1; $ref{$c} = eval '$' . $v; } } if ($ref{url} =~ m/^(.+?\.html)#?/i) { push(@linked_pages, $1) if not grep(m/$1/, @linked_pages); } # If the extracted URL part doesn't look like a URL, assume it is # an anchor and convert to URL accordingly. if ($ref{url} and not $ref{url} =~ m/(?:#|\.html$)/i) { $ref{url} = basename($page) . "#$ref{url}"; } $ref{section} =~ s/^\#(.+)$/\L$1/; $ref{title} =~ s/\s+/ /g; $ref{title} =~ s,]*>(.*?),$1,ig; $ref{title} =~ s,]*>(.*?),$1,ig; $ref{url} = "$url$ref{url}"; $ref{url} = '' if not $url; my @out = ( $manual, $ref{section}, $ref{title}, $ref{url} ); print $fh join('::', @out) . "\n"; } } close(PAGE); return @linked_pages; } # Create a new reference file. open(OUT, '>', 'data/output/manual-references.new') or die "Cannot create data/output/manual-references.new: $!\n"; my $date = strftime('%Y-%m-%d', localtime); print OUT <<"HEADER"; # Data about titles, sections, and URLs of manuals, used to expand references # in tag descriptions and add links for HTML output. Each line of this file # has four fields separated by double colons: # # ::
:: :: <url> # # If <section> is empty, that line specifies the title and URL for the whole # manual. If <url> is empty, that manual is not available on the web. # # Last updated: $date HEADER for my $manual (sort keys %manuals) { my ($index, $url, $ref_re, $fields, $sub_re) = @{$manuals{$manual}}; if (not -f $index) { die "Manual '$manual' not installed, aborting.\n"; } # Extract references from the index. my @subpages = extract_refs(\*OUT, $manual, 0, $index, $url, $ref_re, $fields); # Extract additional subsection references if not available in the index. next if not $sub_re; foreach my $pagename (@subpages) { my $page = dirname($index) . "/$pagename"; extract_refs(\*OUT, $manual, 1, $page, $url, $sub_re, $fields); } } # Replace the old reference file. close OUT or die "Cannot flush data/output/manual-references.new: $!\n"; rename('data/output/manual-references.new', 'data/output/manual-references') or die "Cannot rename data/output/manual-references: $!\n"; # Local Variables: # indent-tabs-mode: nil # cperl-indent-level: 4 # End: # vim: syntax=perl sw=4 sts=4 sr et