#!/usr/bin/perl
#!/usr/local/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) man2html 1.2 97/08/12 12:57:30 @(#)
##  Author:
##      Earl Hood, ehood@medusa.acs.uci.edu
##  Description:
##      man2html is a Perl program to convert formatted nroff output
##      to HTML.
##
##      Recommend command-line options based on platform:
##
##      Platform                Options
##      ---------------------------------------------------------------------
##      c2mp
##      hp9000s700/800          -leftm 1 -topm 8
##      sun4                    -sun
##      ---------------------------------------------------------------------
##
##---------------------------------------------------------------------------##
##  Copyright (C) 1995-1997  Earl Hood, ehood@medusa.acs.uci.edu
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License
##  along with this program; if not, write to the Free Software
##  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
##  02111-1307, USA
##---------------------------------------------------------------------------##

package Man2Html;

use Getopt::Long;

($PROG = $0) =~ s/.*\///;
$VERSION = "3.0.1";

## Input and output filehandles.  A caller that loads this file may
## preset them; otherwise STDIN/STDOUT are used.
$InFH   = \*STDIN   unless $InFH;
$OutFH  = \*STDOUT  unless $OutFH;

## Backspace character:  Used in overstriking detection
*bs = \"\b";

##  Hash of section titles and their HTML tag wrapper.
##  This list allows customization of what HTML tag is used for
##  a given section head.
##
##  The section title can be a regular expression.  Therefore, one must
##  be careful about quoting special characters.
##
##  Each value must be a single opening tag: do_it() places a copy on
##  both sides of the heading text and rewrites the trailing copy into
##  the matching closing tag.
##
%SectionHead = (

    '\S.*OPTIONS.*'             => '<H2>',
    'AUTHORS?'                  => '<H2>',
    'BUGS'                      => '<H2>',
    'COMPATIBILITY'             => '<H2>',
    'DEPENDENCIES'              => '<H2>',
    'DESCRIPTION'               => '<H2>',
    'DIAGNOSTICS'               => '<H2>',
    'ENVIRONMENT'               => '<H2>',
    'ERRORS'                    => '<H2>',
    'EXAMPLES'                  => '<H2>',
    'EXTERNAL INFLUENCES'       => '<H2>',
    'FILES'                     => '<H2>',
    'LIMITATIONS'               => '<H2>',
    'NAME'                      => '<H2>',
    'NOTES?'                    => '<H2>',
    'OPTIONS'                   => '<H2>',
    'REFERENCES'                => '<H2>',
    'RETURN VALUE'              => '<H2>',
    'SECTION.*:'                => '<H2>',
    'SEE ALSO'                  => '<H2>',
    'STANDARDS CONFORMANCE'     => '<H2>',
    'STYLE CONVENTION'          => '<H2>',
    'SYNOPSIS'                  => '<H2>',
    'SYNTAX'                    => '<H2>',
    'WARNINGS'                  => '<H2>',
    '\s+Section.*:'             => '<H2>',

);

## Fallback tag if above is not found
$HeadFallback = '<H2>';

## Other globals

$Bare      = 0;         # Skip printing HTML head/foot flag
$BTag      = 'B';       # Overstrike tag
$CgiUrl    = '';        # CGI URL expression
$Compress  = 0;         # Do blank line compression flag
$K         = 0;         # Do keyword search processing flag
$NoDepage  = 0;         # Do not strip page information
$NoHeads   = 0;         # Do no header detection flag
$SeeAlso   = 0;         # Do only SEE ALSO xrefs flag
$Solaris   = 0;         # Solaris keyword search processing flag
$Sun       = 0;         # Headers not overstriken flag
$Title     = '';        # Title
$UTag      = 'I';       # Underline tag
$ftsz      = 7;         # Bottom margin size
$hdsz      = 7;         # Top margin size
$leftm     = '';        # Left margin pad
$leftmsz   = 0;         # Left margin size
$pgsz      = 66;        # Size of page size
$txsz      = 52;        # Text body length size

#############################################################################
##      Main Block
#############################################################################
{
    if (get_cli_opts()) {
        if ($K) {
            man_k();
        } else {
            do_it();
        }
    } else {
        usage();
    }
}

#############################################################################
##      Subroutines
#############################################################################
##---------------------------------------------------------------------------
##      do_it() converts a formatted manpage read from $InFH into HTML
##      written to $OutFH.  The input is consumed page by page: $hdsz
##      header lines are discarded, $txsz body lines are converted, and
##      $ftsz footer lines are discarded.
##
sub do_it {

    ##  Define while loop and then eval it when used.  The reason
    ##  is to avoid the regular expression reevaluation in the
    ##  section head detection code.

    $doitcode =<<'EndOfDoItCode';

    my($line, $tmp, $i, $head, $preindent, $see_also, $do);

    $see_also = !$SeeAlso;
    print $OutFH "<!-- Manpage converted by man2html $VERSION -->\n";
    LOOP: while(!eof($InFH)) {
        $blank = 0;
        for ($i=0; $i < $hdsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);
        }
        for ($i=0; $i < $txsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);

            ## Check if compress consecutive blank lines
            if ($Compress and !/\S/) {
                if ($blank) { next; } else { $blank = 1; }
            } else {
                $blank = 0;
            }

            ## Try to check if line space is needed at page boundaries ##
            if (!$NoDepage && ($i==0 || $i==($txsz-1)) && !/^\s*$/) {
                /^(\s*)/;  $tmp = length($1);
                if ($do) {
                    if ($tmp < $preindent) { print $OutFH "\n"; }
                } else {
                    $do = 1;
                }
                $preindent = $tmp;
            } else {
                $do = 0;  $preindent = 0;
            }

            ## Interpret line
            $line = $_;
            entitize(\$_);              # Convert [&<>] to entity references

            ## Check for 'SEE ALSO' link only
            if (!$see_also && $CgiUrl && $SeeAlso) {
                ($tmp = $line) =~ s/.\010//go;
                if ($tmp =~ /^\s*SEE\s+ALSO\s*$/o) { $see_also = 1; }
                else { $see_also = 0; }
            }

            ## Create anchor links for manpage references
            s/((((.\010)+)?[\+_\.\w-])+\(((.\010)+)?
              \d((.\010)+)?\w?\))
             /make_xref($1)
             /geox  if $see_also;

            ## Emphasize underlined words
            # s/((_\010[^_])+[\._]?(_\010[^_])+\)?)/emphasize($1)/oge;
            # s/((_\010[^_])+([\._]?(_\010[^_])+)?)/emphasize($1)/oge;
            #
            # The previous expressions were trying to be clever about
            # detecting underlined text which contain non-alphanumeric
            # characters.  nroff will not underline non-alphanumeric
            # characters in an underlined phrase, and the above was trying
            # to detect that.  It does not work all the time, and it
            # screws up other text, so a simplified expression is used.

            s/((_\010[^_])+)/emphasize($1)/oge;

            $secth = 0;
            ## Check for strong text and headings
            if ($Sun || /.\010./o) {
                if (!$NoHeads) {
                    $line =~ s/.\010//go;
                    $tmp = $HeadFallback;
EndOfDoItCode

    ##  Create switch statement for detecting a heading.  The patterns
    ##  are emitted in sorted order so that, when several of them could
    ##  match the same line, the winner does not depend on hash ordering.
    ##
    $doitcode .= "HEADSW: {\n";
    foreach $head (sort keys %SectionHead) {
        $doitcode .= join("", "\$tmp = '$SectionHead{$head}', ",
                              "\$secth = 1, last HEADSW  ",
                              "if \$line =~ /^$leftm$head/o;\n");
    }
    $doitcode .= "}\n";

    ##  Rest of routine
    ##
    $doitcode .=<<'EndOfDoItCode';
                    if ($secth || $line =~ /^$leftm\S/o) {
                        chop $line;
                        ## Wrap the heading in the tag, then turn the
                        ## trailing copy of the tag into a closing tag.
                        $_ = $tmp . $line . $tmp;
                        s%<([^>]*)>$%</$1>%;
                        ## Headings live outside the preformatted block.
                        $_ = "\n</PRE>\n" . $_ . "<PRE>\n";
                    } else {
                        s/(((.\010)+.)+)/strongize($1)/oge;
                    }
                } else {
                    s/(((.\010)+.)+)/strongize($1)/oge;
                }
            }
            print $OutFH $_;
        }

        for ($i=0; $i < $ftsz; $i++) {
            last LOOP  unless defined($_ = <$InFH>);
        }
    }
EndOfDoItCode


    ##  Perform processing.

    printhead()  unless $Bare;
    print $OutFH "<PRE>\n";
    eval $doitcode;                     # $doitcode defined above
    ## A broken section head pattern makes the generated code fail to
    ## compile; report that instead of silently producing no body.
    warn "$PROG: $@"  if $@;
    print $OutFH "</PRE>\n";
    printtail()  unless $Bare;
}

##---------------------------------------------------------------------------
##      get_cli_opts() parses the command-line and sets the global
##      configuration variables.  Returns 1 on success and 0 when option
##      parsing fails or -help was given (the main block then prints the
##      usage message).
##
sub get_cli_opts {
    return 0  unless
    GetOptions(
        "bare",         # Leave out HTML, HEAD, BODY tags.
        "belem=s",      # HTML Element for overstriked text (def: "B")
        "botm=i",       # Number of lines for bottom margin (def: 7)
        "cgiurl=s",     # CGI URL for linking to other manpages
        "cgiurlexp=s",  # CGI URL Perl expr for linking to other manpages
        "compress",     # Compress consecutive blank lines
        "headmap=s",    # Filename of user section head map file
        "k",            # Process input from 'man -k' output.
        "leftm=i",      # Character width of left margin (def: 0)
        "nodepage",     # Do not remove pagination lines
        "noheads",      # Do not detect for section heads
        "pgsize=i",     # Number of lines in a page (def: 66)
        "seealso",      # Link to other manpages only in the SEE ALSO section
        "solaris",      # Parse 'man -k' output from a solaris system
        "sun",          # Section heads are not overstriked in input
        "title=s",      # Title of manpage (def: Not defined)
        "topm=i",       # Number of lines for top margin (def: 7)
        "uelem=s",      # HTML Element for underlined text (def: "I")

        "help"          # Short usage message
    );
    return 0  if defined($opt_help);

    ## Page geometry.  With -nodepage there are no margins to strip, so
    ## the whole page counts as body text.
    $pgsz = $opt_pgsize || $pgsz;
    if (defined($opt_nodepage)) {
        $hdsz   = 0;
        $ftsz   = 0;
    } else {
        $hdsz   = $opt_topm  if defined($opt_topm);
        $ftsz   = $opt_botm  if defined($opt_botm);
    }
    $txsz       = $pgsz - ($hdsz + $ftsz);
    $leftmsz    = $opt_leftm  if defined($opt_leftm);
    $leftm      = ' ' x $leftmsz;

    $Bare       = defined($opt_bare);
    $Compress   = defined($opt_compress);
    $K          = defined($opt_k);
    $NoDepage   = defined($opt_nodepage);
    $NoHeads    = defined($opt_noheads);
    $SeeAlso    = defined($opt_seealso);
    $Solaris    = defined($opt_solaris);
    $Sun        = defined($opt_sun);

    $Title      = $opt_title || $Title;

    ## $CgiUrl holds Perl source that is eval'ed for every cross
    ## reference.  A plain -cgiurl string is wrapped so that it is
    ## interpolated at that time.
    ## NOTE(review): both options are executed as Perl code; they must
    ## only ever come from a trusted command line.
    $CgiUrl     = $opt_cgiurlexp ||
                        ($opt_cgiurl ? qq{return "$opt_cgiurl"} : '');

    ## Element names are stored bare; strip any angle brackets given.
    $BTag       = $opt_belem || $BTag;
    $UTag       = $opt_uelem || $UTag;
    $BTag       =~ s/[<>]//g;
    $UTag       =~ s/[<>]//g;

    if (defined($opt_headmap)) {
        my $mapfile = $opt_headmap;

        ## require() looks up relative names through @INC, which does not
        ## contain the current directory on Perl 5.26 and later.  Anchor
        ## an existing relative filename so that it is loaded as given.
        $mapfile = "./$mapfile"
            if -e $mapfile && $mapfile !~ m{^\.{0,2}/};

        ## require() dies on failure rather than returning false, so it
        ## has to be trapped for the warning to be reachable.
        eval { require $mapfile; 1 }
            or warn "Unable to read $opt_headmap: $@";
    }
    1;
}

##---------------------------------------------------------------------------
##      printhead() writes the opening HTML document structure to $OutFH.
##      The HEAD section and the visible page heading are only emitted
##      when a title ($Title) has been set.  Callers skip this routine
##      when -bare is in effect.
##
sub printhead {
    print $OutFH "<HTML>\n";
    print $OutFH "<HEAD>\n",
                 "<TITLE>$Title</TITLE>\n",
                 "</HEAD>\n"  if $Title;
    print $OutFH "<BODY>\n";
    print $OutFH "<H1>$Title</H1>\n",
                 "<HR>\n"  if $Title;
}

##---------------------------------------------------------------------------
##      printtail() writes the page footer to $OutFH: a rule, the
##      converter credit and the tags that close the document body.
##      Callers skip this routine when -bare is in effect.
##
sub printtail {
    print $OutFH <<'EndOfRef';
<HR>
<ADDRESS>
Man(1) output converted with
man2html
</ADDRESS>
</BODY>
</HTML>
EndOfRef
}

##---------------------------------------------------------------------------
##      emphasize() takes a run of underlined text (each character
##      preceded by "_<backspace>"), removes the overstriking and returns
##      the text wrapped in the underline element ($UTag).
##
sub emphasize {
    my($txt) = shift;
    $txt =~ s/.\010//go;                # drop every "<char><backspace>" pair
    $txt = "<$UTag>$txt</$UTag>";       # the element must also be closed
    $txt;
}

##---------------------------------------------------------------------------
##      strongize() takes a run of overstruck (bold) text, removes the
##      overstriking and returns the text wrapped in the overstrike
##      element ($BTag).
##
sub strongize {
    my($txt) = shift;
    $txt =~ s/.\010//go;                # drop every "<char><backspace>" pair
    $txt = "<$BTag>$txt</$BTag>";       # the element must also be closed
    $txt;
}

##---------------------------------------------------------------------------
##      entitize() converts the HTML special characters (&, <, >) of a
##      line into entity references, in place.  The argument is a
##      reference to the line.  Overstriking is preserved so that the
##      later emphasize()/strongize() passes still see it.
##
sub entitize {
    my($txt) = shift;

    ## Check for special characters in overstrike text ##
    ## Underlined specials: "_<backspace><char>"
    $$txt =~ s/_\010\&/strike('_', '&')/ge;
    $$txt =~ s/_\010</strike('_', '<')/ge;
    $$txt =~ s/_\010>/strike('_', '>')/ge;

    ## Bold specials: the character struck over itself one or more times
    $$txt =~ s/(\&\010)+\&/strike('&', '&')/ge;
    $$txt =~ s/(<\010)+</strike('<', '<')/ge;
    $$txt =~ s/(>\010)+>/strike('>', '>')/ge;

    ## Check for special characters in regular text.  A character that
    ## touches a backspace on either side belongs to an overstrike
    ## sequence (including the ones generated above) and is left alone.
    ## Lookarounds are used so that the neighbouring characters are not
    ## consumed: specials at the start of the line, or separated by a
    ## single character ("a<b<c"), are all converted.
    $$txt =~ s/(?<!\010)([&<>])(?!\010)/htmlize2($1)/ge;
}

##---------------------------------------------------------------------------
##      htmlize() escapes the HTML special characters (&, <, >) of a
##      string in place.  The argument is a reference to the string; the
##      escaped string is also returned.
##
sub htmlize {
    my($str) = shift;
    $$str =~ s/&/\&amp;/g;      # must come first: the entities contain "&"
    $$str =~ s/</\&lt;/g;
    $$str =~ s/>/\&gt;/g;
    $$str;
}

##---------------------------------------------------------------------------
##      htmlize2() is used by entitize.  Unlike htmlize() it takes the
##      string by value and returns an escaped copy.
##
sub htmlize2 {
    my($str) = shift;
    $str =~ s/&/\&amp;/g;       # must come first: the entities contain "&"
    $str =~ s/</\&lt;/g;
    $str =~ s/>/\&gt;/g;
    $str;
}

##---------------------------------------------------------------------------
##      strike() returns the entity reference for an HTML special
##      character with every character of the entity overstruck, so that
##      strongize() and emphasize() still recognize it as text to wrap.
##
##      $w    : '_' selects underline overstriking ("_<backspace>c");
##              anything else selects bold ("c<backspace>c").
##      $char : the special character: '&', '<' or '>'.
##
##      Any other character produces a warning and an undefined result.
##
sub strike {
    my($w, $char) = @_;

    my %underlined = (
        '&' => "_$bs\&_${bs}a_${bs}m_${bs}p_${bs};",
        '<' => "_$bs\&_${bs}l_${bs}t_${bs};",
        '>' => "_$bs\&_${bs}g_${bs}t_${bs};",
    );
    my %bold = (
        '&' => "\&$bs\&a${bs}am${bs}mp${bs}p;${bs};",
        '<' => "\&$bs\&l${bs}lt${bs}t;${bs};",
        '>' => "\&$bs\&g${bs}gt${bs}t;${bs};",
    );

    my $table = ($w eq '_') ? \%underlined : \%bold;
    return $table->{$char}  if exists $table->{$char};

    warn qq|Unrecognized character, "$char", passed to strike()\n|;
    return undef;
}

##---------------------------------------------------------------------------
##      make_xref() converts a manpage crossreference into a hyperlink.
##
##      The argument is the reference as found in the input, e.g.
##      "ls(1)", possibly overstruck.  The overstriking is removed, so
##      the result is wrapped in <B> to keep the reference visually
##      marked.  When a CGI URL expression is configured ($CgiUrl) the
##      reference is additionally linked to the URL it evaluates to.
##
sub make_xref {
    my $str = shift;
    $str =~ s/.\010//go;                        # Remove overstriking

    if ($CgiUrl) {
        ## $title, $section and $subsection are the variables the CGI
        ## URL expression refers to; they must keep these names.
        my($title,$section,$subsection) =
            ($str =~ /([\+_\.\w-]+)\((\d)(\w?)\)/);

        $title =~ s/\+/%2B/g;                   # "+" means space in a query
        ## NOTE(review): $CgiUrl is user-supplied Perl code taken from the
        ## command line and is executed here.
        my($href) = (eval $CgiUrl);

        ## Fall back to an unlinked reference when the expression fails.
        return qq|<B>$str</B>|  unless defined $href;
        qq|<B><A HREF="$href">$str</A></B>|;
    } else {
        qq|<B>$str</B>|;
    }
}

##---------------------------------------------------------------------------
##      man_k() process a keyword search.  The problem we have is there
##      is no standard for keyword search results from man.  Solaris
##      systems have a different enough format to warrant dealing
##      with it as a special case.  For other cases, we try our best.
##      Unfortunately, there are some lines of results that may be
##      skipped.
##
##      Every usable line is filed under its section digit ('1'..'9');
##      anything else goes to the catch-all bucket 'N'.  The buckets are
##      then printed in that order by print_mank_sec().
##
sub man_k {
    my($entry, $names, $secnum, $subsec, $summary, $bucket, $n);
    my(%desc_by,        # bucket => { names => description }
       %sub_by,         # bucket => { names => subsection letter }
       %sec_of);        # names  => section, for the catch-all bucket only

    printhead()  unless $Bare;
    print $OutFH "\n";

    ENTRY: while ($entry = <$InFH>) {
        ## Only lines of the form "names (Nx) - description" are handled
        next ENTRY  unless $entry =~ /\(\d\w?\)\s+-\s/;
        ($names, $secnum, $subsec, $summary) =
            ($entry =~ /^\s*(.*)\((\d)(\w?)\)\s*-\s*(.*)$/);

        if (!$Solaris) {
            $names =~ s/\s(and|or)\s/,/gi;      # Convert and/or to commas
            $names =~ s/^[^:\s]:\s*//;          # Remove prefixed whatis path
        } else {
            ## Solaris lines carry two names before the section; only
            ## the first one is kept.
            $names =~ s/^\s*([\+_\.\w-]+)\s+([\+_\.\w-]+)\s*$/$1/;
        }
        $names =~ s/\s//g;                      # Remove all whitespace
        $names =~ s/,/, /g;                     # Put space after comma
        htmlize(\$summary);                     # Escape special characters
        $summary =~ s/^(.)/\U$1/;               # Uppercase first letter

        $bucket = ($secnum =~ /^[1-9]$/) ? $secnum : 'N';
        $desc_by{$bucket}{$names} = $summary;
        $sub_by{$bucket}{$names}  = $subsec;
        $sec_of{$names} = $secnum  if $bucket eq 'N';
    }

    foreach $n (1 .. 9) {
        print_mank_sec($desc_by{$n} || {}, $n, $sub_by{$n} || {});
    }
    print_mank_sec($desc_by{'N'} || {}, 'N', $sub_by{'N'} || {}, \%sec_of);

    printtail()  unless $Bare;
}
##---------------------------------------------------------------------------
##      print_mank_sec() prints out manpage cross-refs of a specific section.
##
##      $sec    : hash ref, comma separated reference names => description
##      $sect   : section label; 'N' marks the catch-all section
##      $secsub : hash ref, reference names => subsection
##      $secsec : hash ref, reference names => section (only consulted
##                when $sect is 'N')
##
##      Nothing is printed for an empty section.  The entries are written
##      as an HTML definition list; when a CGI URL expression is
##      configured ($CgiUrl) every name is linked to the page of the
##      first name of its entry.
##
sub print_mank_sec {
    my($sec, $sect, $secsub, $secsec) = @_;
    ## $title, $section and $subsection are the variables the CGI URL
    ## expression refers to; they must keep these names.
    my(@array, @refs, $item, $title, $subsection, $i, $section,
       $xref);
    $section = $sect;

    @array = sort keys %$sec;
    if ($#array >= 0) {
        print $OutFH "<H2>Section $section</H2>\n",
                     "<DL COMPACT>\n";
        foreach $item (@array) {
            ## Names are stored as "a, b"; split without the padding so
            ## it ends up neither in the output nor inside a link.
            @refs = split(/,\s*/, $item);
            $section = $secsec->{$item}  if $sect eq 'N';
            $subsection = $secsub->{$item};
            if ($CgiUrl) {
                ($title = $refs[0]) =~ s/\(\)//g;  # watch out for extra ()'s
                ## NOTE(review): $CgiUrl is user-supplied Perl code taken
                ## from the command line and is executed here.
                $xref = eval $CgiUrl;
            }
            print $OutFH "<DT>\n";
            $i = 0;
            foreach (@refs) {
                if ($CgiUrl && defined $xref) {
                    print $OutFH qq|<A HREF="$xref">$_</A>|;
                } else {
                    print $OutFH $_;
                }
                print $OutFH ", "  if $i < $#refs;
                $i++;
            }
            print $OutFH " ($section$subsection)\n",
                         "</DT><DD>\n",
                         $sec->{$item}, "</DD>\n";
        }
        print $OutFH "</DL>\n";
    }
}

##---------------------------------------------------------------------------
##      usage() prints the usage message to $OutFH and exits the program
##      with status 0.
##
sub usage {
    print $OutFH <<EndOfUsage;
Usage: $PROG [ options ] < infile > outfile
Options:
  -bare            : Do not put in HTML, HEAD, BODY tags
  -belem <elem>    : HTML Element for overstriked text (def: "B")
  -botm <#>        : Number of lines for bottom margin (def: 7)
  -cgiurl <url>    : URL for linking to other manpages
  -cgiurlexp <url> : Perl expression URL for linking to other manpages
  -compress        : Compress consective blank lines
  -headmap <file>  : Filename of user section head map file
  -help            : This message
  -k               : Process a keyword search result
  -leftm <#>       : Character width of left margin (def: 0)
  -nodepage        : Do not remove pagination lines
  -noheads         : Turn off section head detection
  -pgsize <#>      : Number of lines in a page (def: 66)
  -seealso         : Link to other manpages only in the SEE ALSO section
  -solaris         : Process keyword search result in Solaris format
  -sun             : Section heads are not overstriked in input
  -title <string>  : Title of manpage (def: Not defined)
  -topm <#>        : Number of lines for top margin (def: 7)
  -uelem <elem>    : HTML Element for underlined text (def: "I")

Description:
  $PROG takes formatted manpages from STDIN and converts it to HTML sent
  to STDOUT.  The -topm and -botm arguments are the number of lines to the
  main body text and NOT to the running headers/footers.

Version:
  $VERSION
  Copyright (C) 1995-1997  Earl Hood, ehood\@medusa.acs.uci.edu
  $PROG comes with ABSOLUTELY NO WARRANTY and $PROG may be copied only
  under the terms of the GNU General Public License, which may be found in
  the $PROG distribution.

EndOfUsage
    exit 0;
}