#!/usr/bin/perl -w
#
# Lintian HTML reporting tool -- Create Lintian web reports
#
# Copyright (C) 1998 Christian Schwarz and Richard Braakman
# Copyright (C) 2007 Russ Allbery
#
# This program is free software.  It is distributed under the terms of
# the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, you can find it on the World Wide
# Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301, USA.

use strict;

use File::Copy qw(copy);
use URI::Escape;
use Text::Template ();

# ------------------------------
# Global variables and configuration

# Maximum number of identical tags per package to display.  Any remaining tags
# will be compressed into a "... reported %d more times" line.
our $MAX_TAGS = 8;

# These have no default and must be set in the configuration file.
# FIXME: $statistics_file should be in all caps as well.
our ($LINTIAN_ROOT, $LINTIAN_LAB, $LINTIAN_ARCHIVEDIR, $LINTIAN_DIST,
     $LINTIAN_SECTION, $LINTIAN_ARCH, $HTML_TMP_DIR, $statistics_file,
     $LINTIAN_AREA);

# Read the configuration.
require './config';

if (defined $LINTIAN_SECTION and not defined $LINTIAN_AREA) {
    $LINTIAN_AREA = $LINTIAN_SECTION;
}

# The path to the mirror timestamp.
our $LINTIAN_TIMESTAMP
    = "$LINTIAN_ARCHIVEDIR/project/trace/ftp-master.debian.org";

# FIXME: At least the lab should be a parameter to Read_pkglists rather
# than an environment variable.
$ENV{'LINTIAN_LAB'} = $LINTIAN_LAB;
$ENV{'LINTIAN_ROOT'} = $LINTIAN_ROOT;

# Import Lintian Perl libraries.
use lib "$ENV{LINTIAN_ROOT}/lib";
use Lintian::Tag::Info ();
use Read_pkglists;
use Text_utils;
use Util;

# Global variables from Read_pkglists.  Ugh.
# FIXME: Read_pkglists should return this information instead.
our (%binary_info, %source_info, %udeb_info, %bin_src_ref);

# Get additional tag information.
our %tag_extra = ();

opendir(CHECKDIR, "$LINTIAN_ROOT/checks")
    or fail("cannot read directory $LINTIAN_ROOT/checks");

for my $check (readdir CHECKDIR) {
    next unless $check =~ /\.desc$/;
    my @tags = read_dpkg_control("$LINTIAN_ROOT/checks/$check");
    shift(@tags);
    foreach my $tag (@tags) {
        next unless $tag->{severity} and $tag->{certainty};
        my $name = $tag->{tag};
        $tag_extra{$name}{severity} = $tag->{severity};
        $tag_extra{$name}{certainty} = $tag->{certainty};
    }
}

closedir(CHECKDIR);

# Set the Lintian version, current timestamp, and archive timestamp.
our $LINTIAN_VERSION = `$LINTIAN_ROOT/frontend/lintian --print-version`;
our $timestamp = `date -u --rfc-822`;
our $mirror_timestamp = slurp_entire_file($LINTIAN_TIMESTAMP);
chomp ($LINTIAN_VERSION, $timestamp);
$mirror_timestamp =~ s/\n.*//s;


# ------------------------------
# Initialize templates

# The path to our templates.
our $TEMPLATES = "$LINTIAN_ROOT/reporting/templates";

# This only has to be done once, so do it at the start and then reuse the same
# templates throughout.
our %templates;
for my $template (qw/head foot clean index maintainer maintainers packages tag
                     tags tags-severity/) {
    my %options = (TYPE => 'FILE', SOURCE => "$TEMPLATES/$template.tmpl");
    $templates{$template} = Text::Template->new (%options)
        or die "cannot load template $template: $Text::Template::ERROR\n";
}


# ------------------------------
# Main routine

# Read the package lists.
#
# FIXME: get_bin_src_ref runs read_src_list unconditionally so we can't call
# it directly, which is confusing.
read_bin_list;
read_udeb_list;
get_bin_src_ref;

# Create output directories.
mkdir($HTML_TMP_DIR, 0777)
    or die "cannot create output directory $HTML_TMP_DIR: $!\n";
mkdir("$HTML_TMP_DIR/full", 0777)
    or die "cannot create output directory $HTML_TMP_DIR/full: $!\n";
mkdir("$HTML_TMP_DIR/maintainer", 0777)
    or die "cannot create output directory $HTML_TMP_DIR/maintainer: $!\n";
mkdir("$HTML_TMP_DIR/tags", 0777)
    or die "cannot create output directory $HTML_TMP_DIR/tags: $!\n";
symlink(".", "$HTML_TMP_DIR/reports")
    or die "cannot create symlink $HTML_TMP_DIR/reports: $!\n";
symlink("$LINTIAN_ROOT/doc/lintian.html", "$HTML_TMP_DIR/manual")
    or die "cannot create symlink $HTML_TMP_DIR/manual: $!\n";
if ($ARGV[0]) {
    symlink($ARGV[0], "$HTML_TMP_DIR/lintian.log")
        or die "cannot create symlink $HTML_TMP_DIR/lintian.log: $!\n";
}
copy("$LINTIAN_ROOT/reporting/lintian.css", "$HTML_TMP_DIR/lintian.css")
    or die "cannot copy lintian.css to $HTML_TMP_DIR: $!\n";
for my $image (qw/ico.png l.png logo-small.png/) {
    copy("$LINTIAN_ROOT/reporting/images/$image", "$HTML_TMP_DIR/$image")
        or die "cannot copy images/$image to $HTML_TMP_DIR: $!\n";
}

# This variable will accumulate statistics.  For tags: errors, warnings,
# experimental, overridden, and info are the keys holding the count of tags of
# that sort.  For packages: binary, udeb, and source are the number of
# packages of each type with Lintian errors or warnings.  For maintainers:
# maintainers is the number of maintainers with Lintian errors or warnings.
my %statistics;

# %by_maint holds a hash of maintainer names to packages and tags.  Each
# maintainer is a key.  The value is a hash of package names to hashes.  Each
# package hash is in turn a hash of versions to an anonymous array of hashes,
# with each hash having keys code, package, type, tag, severity, certainty,
# extra, and xref.  xref gets the partial URL of the maintainer page for that
# source package.
#
# In other words, the lintian output line:
#
#     W: gnubg source: substvar-source-version-is-deprecated gnubg-data
#
# for gnubg 0.15~20061120-1 maintained by Russ Allbery <rra@debian.org> is
# turned into the following structure:
#
# { 'gnubg' => {
#       '0.15~20061120-1' => [
#           { code      => 'W',
#             package   => 'gnubg',
#             type      => 'source',
#             tag       => 'substvar-source-version-is-deprecated',
#             severity  => 'normal',
#             certainty => 'certain',
#             extra     => 'gnubg-data'
#             xref      => 'rra@debian.org.html#gnubg' } ] } }
#
# and then stored under the key 'Russ Allbery <rra@debian.org>'
#
# %by_uploader holds the same thing except for packages for which the person
# is only an uploader.
#
# %by_tag is a hash of tag names to an anonymous array of tag information
# hashes just like the inside-most data structure above.
my (%by_maint, %by_uploader, %by_tag);

# We take a lintian log file on either standard input or as the first
# argument.  This log file contains all the tags lintian found, plus N: tags
# with informational messages.  Ignore all the N: tags and load everything
# else into the hashes we use for all web page generation.
#
# We keep track of a hash from maintainer page URLs to maintainer values so
# that we don't have two maintainers who map to the same page and overwrite
# each other's pages.  If we find two maintainers who map to the same URL,
# just assume that the second maintainer is the same as the first (but warn
# about it).
my (%seen, %saw_maintainer);
while (<>) {
    chomp;
    next unless m/^([EWIXO]): (\S+)(?: (\S+))?: (\S+)(?:\s+(.*))?/;
    my ($code, $package, $type, $tag, $extra) = ($1, $2, $3, $4, $5);
    $type = 'binary' unless (defined $type);
    next unless ($type eq 'source' || $type eq 'binary' || $type eq 'udeb');

    # Update statistics.
    my $key = {
        E => 'errors',
        W => 'warnings',
        I => 'info',
        X => 'experimental',
        O => 'overridden'
    }->{$code};
    $statistics{$key}++;
    unless ($seen{"$package $type"}) {
        $statistics{"$type-packages"}++;
        $seen{"$package $type"} = 1;
    }

    # Determine the source package for this package and warn if there appears
    # to be no source package in the archive.  Determine the maintainer and
    # version.  Work around a missing source package by pulling information
    # from a binary package or udeb of the same name if there is any.
    my ($source, $version, $source_version, $maintainer, $uploaders);
    if ($type eq 'source') {
        $source = $package;
        if (exists $source_info{$source}) {
            $version = $source_info{$source}->{version};
            $maintainer = $source_info{$source}->{maintainer};
            $uploaders = $source_info{$source}->{uploaders};
        } else {
            warn "source package $package not listed!\n";
        }
    } else {
        $source = $bin_src_ref{$package};
        if ($source and exists $source_info{$source}) {
            $maintainer = $source_info{$source}->{maintainer};
            $uploaders = $source_info{$source}->{uploaders};
        } else {
            warn "source for package $package not found!\n";
            $source = $package;
            if ($type eq 'binary') {
                $maintainer = $binary_info{$package}->{maintainer};
            } elsif ($type eq 'udeb') {
                $maintainer = $udeb_info{$package}->{maintainer};
            }
        }
        if ($type eq 'binary') {
            $version = $binary_info{$package}->{version};
            $source_version = $binary_info{$package}->{'source-version'};
        } elsif ($type eq 'udeb') {
            $version = $udeb_info{$package}->{version};
            $source_version = $udeb_info{$package}->{'source-version'};
        }
    }
    $maintainer ||= '(unknown)';
    $version ||= 'unknown';
    $source_version ||= $version;

    # Check if we've seen the URL for this maintainer before and, if so, map
    # them to the same person as the previous one.
    $maintainer = map_maintainer ($maintainer);
    $saw_maintainer{$maintainer} = 1;

    # Update maintainer statistics.
    $statistics{maintainers}++ unless defined $by_maint{$maintainer};

    # Sanitize, just out of paranoia.
    $source =~ s/[^a-zA-Z0-9.+-]/_/g;
    $version =~ s/[^a-zA-Z0-9.+:~-]/_/g;

    # Add the tag information to our hashes.  Share the data between the
    # hashes to save space (which means we can't later do destructive tricks
    # with it).
    my $info = {
        code      => html_quote ($code),
        package   => html_quote ($package),
        version   => html_quote ($version),
        type      => html_quote ($type),
        tag       => html_quote ($tag),
        severity  => html_quote ($tag_extra{$tag}{severity}),
        certainty => html_quote ($tag_extra{$tag}{certainty}),
        extra     => html_quote ($extra),
        xref      => maintainer_url ($maintainer) . "#$source"
    };
    $by_maint{$maintainer}{$source}{$source_version} ||= [];
    push(@{ $by_maint{$maintainer}{$source}{$source_version} }, $info);
    $by_tag{$tag} ||= [];
    push(@{ $by_tag{$tag} }, $info);

    # If the package had uploaders listed, also add the information to
    # %by_uploaders (still sharing the data between hashes).
    if ($uploaders) {
        my @uploaders = split (/\s*,\s*/, $uploaders);
        for (@uploaders) {
            my $uploader = map_maintainer ($_);
            next if $uploader eq $maintainer;
            $saw_maintainer{$uploader} = 1;
            $by_uploader{$uploader}{$source}{$source_version} ||= [];
            push(@{ $by_uploader{$uploader}{$source}{$source_version} }, $info);
        }
    }
}

# Build a hash of all maintainers, not just those with Lintian tags.  We use
# this later to generate stub pages for maintainers whose packages are all
# Lintian-clean.
my %clean;
for my $source (keys %source_info) {
    my $maintainer = $source_info{$source}->{maintainer};
    my $id = maintainer_url ($maintainer);
    $clean{$id} = $maintainer;
}

# Now, walk through the tags by source package (sorted by maintainer).  Output
# a summary page of errors and warnings for each maintainer, output a full
# page that includes info, experimental, and overriden tags, and assemble the
# maintainer index and the QA package list as we go.
my (%qa, %maintainers, %packages);
my @maintainers;
{
    my %seen;
    @maintainers =
        sort grep { !$seen{$_}++ } keys (%by_maint), keys (%by_uploader);
}
for my $maintainer (@maintainers) {
    my $id = maintainer_url ($maintainer);
    delete $clean{$id};

    # For each of this maintainer's packages, add statistical information
    # about warnings and errors to the QA list and build the packages hash
    # used for the package index.  We only do this for the maintainer
    # packages, not the uploader packages, to avoid double-counting.
    for my $source (keys %{ $by_maint{$maintainer} }) {
        my ($errors, $warnings) = (0, 0);
        for my $version (keys %{ $by_maint{$maintainer}{$source} }) {
            my $tags = $by_maint{$maintainer}{$source}{$version};
            for my $tag (@$tags) {
                $errors++ if $tag->{code} eq 'E';
                $warnings++ if $tag->{code} eq 'W';
                $packages{$tag->{package}} = $tag->{xref};
            }
        }
        $qa{$source} = [ $errors, $warnings ];
    }

    # Determine if the maintainer's page is clean.  Check all packages for
    # which they're either maintainer or uploader and set $error_clean if
    # they have no errors or warnings.
    my $error_clean = 1;
    for my $source (keys %{ $by_maint{$maintainer} },
                    keys %{ $by_uploader{$maintainer} }) {
        my $versions = $by_maint{$maintainer}{$source}
            || $by_uploader{$maintainer}{$source};
        for my $version (keys %$versions) {
            my $tags = $versions->{$version};
            for my $tag (@$tags) {
                $error_clean = 0 if ($tag->{code} eq 'E');
                $error_clean = 0 if ($tag->{code} eq 'W');
            }
        }
    }

    # Determine the parts of the maintainer and the file name for the
    # maintainer page.
    my ($name, $email) = ($maintainer =~ /^(.*) <([^>]+)>/);
    $name = 'Unknown Maintainer' unless $name;
    $email = 'unknown' unless $email;
    my $regular = "maintainer/$id";
    my $full = "full/$id";

    # Create the regular maintainer page (only errors and warnings) and the
    # full maintainer page (all tags, including overrides and info tags).
    print "Generating page for $id\n";
    my %data = (
        email      => html_quote (uri_escape ($email)),
        errors     => 1,
        id         => $id,
        maintainer => html_quote ($maintainer),
        name       => html_quote ($name),
        packages   => $by_maint{$maintainer},
        uploads    => $by_uploader{$maintainer},
    );
    my $template;
    if ($error_clean) {
        $template = $templates{clean};
    } else {
        $template = $templates{maintainer};
    }
    output_template ($regular, $template, \%data);
    $template = $templates{maintainer};
    $data{errors} = 0;
    output_template ($full, $template, \%data);

    # Add this maintainer to the hash of maintainer to URL mappings.
    $maintainers{$maintainer} = $id;
}

# Write out the maintainer index.
my %data = (
    maintainers => \%maintainers,
);
output_template ('maintainers.html', $templates{maintainers}, \%data);

# Write out the QA package list.
open (QA, '>', "$HTML_TMP_DIR/qa-list.txt")
    or die "cannot create qa-list.txt: $!\n";
for my $source (sort keys %qa) {
    print QA "$source $qa{$source}[0] $qa{$source}[1]\n";
}
close QA or die "cannot write to qa-list: $!\n";

# Now, generate stub pages for every maintainer who has only clean packages.
for my $id (keys %clean) {
    my $maintainer = $clean{$id};
    my ($name, $email) = ($maintainer =~ /^(.*) <([^>]+)>/);
    $email = 'unknown' unless $email;
    my %data = (
        email      => html_quote (uri_escape ($email)),
        maintainer => html_quote ($maintainer),
        name       => html_quote ($name),
    );
    print "Generating clean page for $id\n";
    output_template ("maintainer/$id", $templates{clean}, \%data);
    output_template ("full/$id", $templates{clean}, \%data);
}

# Create the pages for each tag.  Each page shows the extended description for
# the tag and all the packages for which that tag was issued.
for my $tag (sort keys %by_tag) {
    my $info = Lintian::Tag::Info->new($tag);
    my $description;
    if ($info) {
        $description = $info->description('html', '    ');
    } else {
        $description = "    <p>Can't find description of tag $tag.</p>";
    }
    my $code = 'O';
    foreach (@{$by_tag{$tag}}) {
        if ($_->{code} ne 'O') {
            $code = $_->{code};
            last;
        }
    }
    my %data = (
        description => $description,
        tag         => html_quote ($tag),
        code        => $code,
        tags        => $by_tag{$tag},
    );
    output_template ("tags/$tag.html", $templates{tag}, \%data);
}

# Create the general tag indices.
%data = (
    tags      => \%by_tag,
);
output_template ('tags.html', $templates{tags}, \%data);
output_template ('tags-severity.html', $templates{'tags-severity'}, \%data);

# Generate the package lists.  These are huge, so we break them into four
# separate pages.
#
# FIXME: Does anyone actually use these pages?  They're basically unreadable.
my %list;
$list{'0-9, A-F'} = [];
$list{'G-L'}      = [];
$list{'M-R'}      = [];
$list{'S-Z'}      = [];
for my $package (sort keys %packages) {
    my $first = uc substr($package, 0, 1);
    if    ($first le 'F') { push(@{ $list{'0-9, A-F'} }, $package) }
    elsif ($first le 'L') { push(@{ $list{'G-L'} },      $package) }
    elsif ($first le 'R') { push(@{ $list{'M-R'} },      $package) }
    else                  { push(@{ $list{'S-Z'} },      $package) }
}
%data = (
    packages  => \%packages,
);
my $i = 1;
for my $area (sort keys %list) {
    $data{area} = $area;
    $data{list} = $list{$area};
    output_template ("packages_$i.html", $templates{packages}, \%data);
    $i++;
}

# Finally, we can start creating the index page.  First, read in the old
# statistics file so that we can calculate deltas for all of our statistics.
my $old_statistics;
if (-f $statistics_file) {
    ($old_statistics) = read_dpkg_control($statistics_file);
}
my %delta;
my @attrs = qw(maintainers source-packages binary-packages udeb-packages
               errors warnings info experimental overridden);
for my $attr (@attrs) {
    my $old = $old_statistics->{$attr} || 0;
    $statistics{$attr} ||= 0;
    $delta{$attr} = sprintf("%d (%+d)", $statistics{$attr},
                            $statistics{$attr} - $old);
}

# Update the statistics file.
open (STATS, '>', $statistics_file)
    or die "cannot open $statistics_file for writing: $!\n";
print STATS "last-updated: $timestamp\n";
print STATS "mirror-timestamp: $mirror_timestamp\n";
for my $attr (@attrs) {
    print STATS "$attr: $statistics{$attr}\n";
}
print STATS "lintian-version: $LINTIAN_VERSION\n";
close STATS or die "cannot write to $statistics_file: $!\n";

# Create the main page.
%data = (
    architecture => $LINTIAN_ARCH,
    delta        => \%delta,
    dist         => $LINTIAN_DIST,
    mirror       => $mirror_timestamp,
    previous     => $old_statistics->{'last-updated'},
    area         => $LINTIAN_AREA,
);
output_template ('index.html', $templates{index}, \%data);
exit 0;

# ------------------------------
# Utility functions

# Determine the file name for the maintainer page given a maintainer.  It
# should be <email>.html where <email> is their email address with all
# characters other than a-z A-Z 0-9 - _ . @ = + replaced with _.  Don't change
# this without coordinating with QA.
sub maintainer_url {
    my ($maintainer) = @_;
    my ($email) = ($maintainer =~ /<([^>]+)>/);
    my ($regular, $full);
    if ($email) {
        my $id = $email;
        $id =~ tr/a-zA-Z0-9_.@=+-/_/c;
        return "$id.html";
    } else {
        return 'unsorted.html';
    }
}

# Deduplicate maintainers.  Maintains a cache of the maintainers we've seen
# with a given e-mail address, issues a warning if two maintainers have the
# same e-mail address, and returns the maintainer string that we should use
# (which is whatever maintainer we saw first with that e-mail).
{
    my (%urlmap, %warned);
    sub map_maintainer {
        my ($maintainer) = @_;
        my $url = maintainer_url ($maintainer);
        if ($urlmap{$url} && $urlmap{$url} ne $maintainer) {
            warn "$maintainer has the same page as $urlmap{$url}\n"
                unless ($warned{$maintainer}
                        || lc ($maintainer) eq lc ($urlmap{$url})
                        || $maintainer =~ /\@lists\.(alioth\.)?debian\.org>/);
            $warned{$maintainer}++;
            $maintainer = $urlmap{$url};
        } else {
            $urlmap{$url} = $maintainer;
        }
        return $maintainer;
    }
}

# Quote special characters for HTML output.
sub html_quote {
    my ($text) = @_;
    $text ||= '';
    $text =~ s/&/\&amp;/g;
    $text =~ s/</\&lt;/g;
    $text =~ s/>/\&gt;/g;
    return $text;
}

# Given a file name, a template, and a data hash, fill out the template with
# that data hash and output the results to the file.
sub output_template {
    my ($file, $template, $data) = @_;
    $data->{version} ||= $LINTIAN_VERSION;
    $data->{timestamp} ||= $timestamp;
    $data->{head} ||= sub { $templates{head}->fill_in (HASH => { page_title => $_[0],
                                                                 path_prefix => '../' x ($_[1]||0),
                                                                 %$data }) };
    $data->{foot} ||= sub { $templates{foot}->fill_in (HASH => $data) };
    open (OUTPUT, '>', "$HTML_TMP_DIR/$file")
        or die "creating $HTML_TMP_DIR/$file falied: $!\n";
    $template->fill_in (OUTPUT => \*OUTPUT, HASH => $data)
        or die "filling out $file failed: $Text::Template::ERROR\n";
    close OUTPUT;
}

# Local Variables:
# indent-tabs-mode: nil
# cperl-indent-level: 4
# End:
# vim: syntax=perl sw=4 sts=4 ts=4 et shiftround