#!/usr/bin/perl -w
# Copyright © 2001 Colin Watson
# Copyright © 2008 Jordà Polo
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, you can find it on the World Wide
# Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301, USA.
# Should be run from the top level of the Lintian source tree or with
# LINTIAN_ROOT set appropriately. You need copies of all the relevant manuals
# installed in the standard places locally (packages debian-policy,
# developers-reference, doc-base, python, lintian, menu and libpkg-guide).
use strict;
use warnings;
use File::Basename;
use POSIX qw(strftime);
BEGIN {
my $LINTIAN_ROOT = $ENV{'LINTIAN_ROOT'};
if (not $LINTIAN_ROOT) {
use Cwd ();
$ENV{'LINTIAN_ROOT'} = $LINTIAN_ROOT = Cwd::cwd();
} else {
chdir $LINTIAN_ROOT or die "Cannot chdir to $LINTIAN_ROOT: $!\n";
}
}
my $LINTIAN_ROOT = $ENV{'LINTIAN_ROOT'};
# For each manual, we need:
# * Location of the manual index on the local filesystem
# * Base URL for the eventual target of the reference (or empty string if no
# public URL is available)
# * Regex to match the possible references
# * Mapping from regex fields to reference fields (array of arrays of
# keywords: url, section title; the position of each keyword in the array
# defines which is its corresponding group in the regex)
#
# Optionally, if there are subsections that aren't available in the index, an
# additional regex can be defined to match possible references on other pages
# of the manual.
my $title_re = qr/
(.+?)<\/title\s?>/i;
my $link_re = qr//;
my $index_re = qr/([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)<\/a>/;
my $fields = [ [ 'url' ], [ 'section' ], [ 'title' ] ];
my $dbk_index_re = qr/([\d.]+?)\.\s+([\w\s[:punct:]]+?)<\/a\s*>/i;
my $dbk_fields = [ [ 'section' ], [ 'url' ], [ 'title' ] ];
my %manuals = (
'policy' => [
'/usr/share/doc/debian-policy/policy.html/index.html',
'http://www.debian.org/doc/debian-policy/',
$link_re, $fields
],
'menu-policy' => [
'/usr/share/doc/debian-policy/menu-policy.html/index.html',
'http://www.debian.org/doc/packaging-manuals/menu-policy/',
$link_re, $fields
],
'perl-policy' => [
'/usr/share/doc/debian-policy/perl-policy.html/index.html',
'http://www.debian.org/doc/packaging-manuals/perl-policy/',
$link_re, $fields
],
'python-policy' => [
'/usr/share/doc/python/python-policy.html/index.html',
'http://www.debian.org/doc/packaging-manuals/python-policy/',
$link_re, $fields
],
'java-policy' => [
'/usr/share/doc/java-common/debian-java-policy/index.html',
'http://www.debian.org/doc/packaging-manuals/java-policy/',
$dbk_index_re, $dbk_fields
],
'vim-policy' => [
'/usr/share/doc/vim-doc/vim-policy.html/index.html',
'http://pkg-vim.alioth.debian.org/vim-policy.html/',
$dbk_index_re, $dbk_fields
],
'lintian' => [
'/usr/share/doc/lintian/lintian.html/index.html',
'http://lintian.debian.org/manual/',
$dbk_index_re, $dbk_fields
],
'devref' => [
'/usr/share/doc/developers-reference/index.html',
'http://www.debian.org/doc/developers-reference/',
$index_re, $fields,
qr/<\/a>([\d\.]+?)\.? ([\w\s[:punct:]]+?)<\/h[45]>/
],
'menu' => [
'/usr/share/doc/menu/html/index.html',
'http://www.debian.org/doc/packaging-manuals/menu.html/',
$index_re, $fields
],
'doc-base' => [
'/usr/share/doc/doc-base/doc-base.html/index.html', '',
$index_re, $fields
],
'debconf-spec' => [
'/usr/share/doc/debian-policy/debconf_specification.html',
'http://www.debian.org/doc/packaging-manuals/debconf_specification.html',
qr/([\w\s[:punct:]]+?)<\/a>/,
[ [ 'section', 'url' ], [ 'title' ] ]
],
'fhs' => [
'/usr/share/doc/debian-policy/fhs/fhs-2.3.html',
'http://www.pathname.com/fhs/pub/fhs-2.3.html',
qr/([\w\s[:punct:]]+?)<\/a\s*>/i,
[ [ 'section', 'url' ], [ 'title' ] ]
],
);
# extract_refs -- Extract manual references from HTML file.
#
# This function takes the output file handle, the path to the page, and the
# regex to match, and prints references to stdout. The second argument is used
# to decide whether to look for the title (0) or not (1). It returns a list of
# pages linked by the extracted references.
sub extract_refs {
my ($fh, $manual, $title, $page, $url, $ref_re, $fields) = @_;
my @linked_pages = ();
open(PAGE, '<', $page) or die "Couldn't open $page: $!";
# Read until there are 2 newlines. This hack is needed since some lines in
# the Developer's Reference are cut in the middle of ....
local $/ = "\n\n";
while () {
if (not $title and m/$title_re/) {
$title = 1;
my @out = ( $manual, '', $1, $url );
print $fh join('::', @out) . "\n";
}
while (m/$ref_re/g) {
my %ref;
for (my $i = 0; $i < scalar @{$fields}; $i++) {
foreach my $c (@{$fields->[$i]}) {
my $v = $i + 1;
$ref{$c} = eval '$' . $v;
}
}
if ($ref{url} =~ m/^(.+?\.html)#?/i) {
push(@linked_pages, $1) if not grep(m/$1/, @linked_pages);
}
# If the extracted URL part doesn't look like a URL, assume it is
# an anchor and convert to URL accordingly.
if ($ref{url} and not $ref{url} =~ m/(?:#|\.html$)/i) {
$ref{url} = basename($page) . "#$ref{url}";
}
$ref{section} =~ s/^\#(.+)$/\L$1/;
$ref{title} =~ s/\s+/ /g;
$ref{title} =~ s,]*>(.*?),$1,ig;
$ref{title} =~ s,]*>(.*?)
,$1
,ig;
$ref{url} = "$url$ref{url}";
$ref{url} = '' if not $url;
my @out = ( $manual, $ref{section}, $ref{title}, $ref{url} );
print $fh join('::', @out) . "\n";
}
}
close(PAGE);
return @linked_pages;
}
# Create a new reference file.
open(OUT, '>', 'data/output/manual-references.new')
or die "Cannot create data/output/manual-references.new: $!\n";
my $date = strftime('%Y-%m-%d', localtime);
print OUT <<"HEADER";
# Data about titles, sections, and URLs of manuals, used to expand references
# in tag descriptions and add links for HTML output. Each line of this file
# has four fields separated by double colons:
#
# :: :: ::
#
# If is empty, that line specifies the title and URL for the whole
# manual. If is empty, that manual is not available on the web.
#
# Last updated: $date
HEADER
for my $manual (sort keys %manuals) {
my ($index, $url, $ref_re, $fields, $sub_re) = @{$manuals{$manual}};
if (not -f $index) {
die "Manual '$manual' not installed, aborting.\n";
}
# Extract references from the index.
my @subpages
= extract_refs(\*OUT, $manual, 0, $index, $url, $ref_re, $fields);
# Extract additional subsection references if not available in the index.
next if not $sub_re;
foreach my $pagename (@subpages) {
my $page = dirname($index) . "/$pagename";
extract_refs(\*OUT, $manual, 1, $page, $url, $sub_re, $fields);
}
}
# Replace the old reference file.
close OUT or die "Cannot flush data/output/manual-references.new: $!\n";
rename('data/output/manual-references.new',
'data/output/manual-references')
or die "Cannot rename data/output/manual-references: $!\n";
# Local Variables:
# indent-tabs-mode: nil
# cperl-indent-level: 4
# End:
# vim: syntax=perl sw=4 sts=4 sr et