Main code

# This script is released under the GFDL license, see
# http://en.wikipedia.org/w/index.php?title=User:HBC_Archive_Indexerbot/source&action=history
# for a full list of contributors

### Configuration ###
# Time to sleep between writes (in seconds)
my $write_delay = 5;
# Max pages to download at once
my $download_max = 25;

# Default template
my $default_template = 'User:HBC Archive Indexerbot/default template';

# Cache paths
my $cache = 'cache';
my $wiki_cache = "$cache/wiki";
my $obj_cache = "$cache/obj";
my $report_cache = "$cache/reports";
### End Configuration ###

use strict;
use warnings;
use Data::Dumper;
use Date::Parse;
use Digest::SHA1 qw(sha1_hex);
use Encode qw(decode_utf8 encode_utf8);
use HTML::Entities qw(encode_entities);
use IO::Socket;
use MediaWiki;
use POSIX qw(strftime);
use Storable;
use Time::Duration;
use Time::Local;
use URI::Escape;
use XML::Simple;


my($log_file,$pages_watched,$pages_downloaded,$pages_attempted,$dl,$ul) = ('',0,0,0,0,0);
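# Tag name built at runtime so no literal <nowiki> tags appear when this source is posted on-wiki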
my $nowiki = 'nowiki';
my $start_time = undef;
#########################################
# Log into Wikipedia                    #
#########################################
die "Cache directories must be created in advance\n" unless (-d $cache && -d $wiki_cache && -d $obj_cache);
open(PASS,'password') or die "Cannot open password file: $!\n"; # A file with only the password, no carriage return
sysread(PASS, my $password, -s(PASS));  # Keeps the password out of the source code.
close(PASS);
writelog                  ('Connecting to Wikipedia');
my $c                  =   MediaWiki->new;
$c->setup
                        ({
                          'bot' => {'user' => 'HBC Archive Indexerbot','pass' => $password},
                          'wiki' => {'host' => 'en.wikipedia.org','path' => 'w'}
                        }) || die 'Failed to log in';
my $whoami              =  $c->user();
writelog                 ($whoami.' connected');

#########################################
# Update cache to modern state          #
#########################################
writelog                 ('Gathering jobs');
my @master_job_list     =  gather_jobs();
my @post_jobs           = @master_job_list;


writelog                  (scalar(@master_job_list).' jobs found');
writelog                 ('Parsing cache');
my $rh_cache_data       =  parse_cache();
writelog                 ('done');
writelog                 ('Parsing watchlist');
my $rh_modified_data    =  parse_watchlist();
writelog                 ('done');
download_pages(find_updated());
download_pages(find_templates(@master_job_list));
#push                     (@needed,&find_holes());
                           fetch_pages(@master_job_list);
writelog                 ("$pages_watched pages added to watchlist.");
writelog                 ("$pages_downloaded out of $pages_attempted downloaded.");
#########################################
# Parse cached data and create reports  #
#########################################

writelog ('Creating reports');
foreach my $ra_job (@post_jobs) {
  my $page = decode_utf8(encode_utf8($ra_job->{'page'}));
  my $dest = decode_utf8(encode_utf8($ra_job->{'target'}));
  my $dest_escaped = _escape($dest);
  my $mask = decode_utf8(encode_utf8(join(', ', @{$ra_job->{'mask'}})));
  my $index_here = $ra_job->{'indexhere'};
  unless (check_text(1000,$dest_escaped))
    {
    writelog ('Not writing to [['.$dest.']] as I cannot find permission (sourced from: [['.$page.']])');
    next;
    }
  my $report = create_report($ra_job);
  open(REPORT, ">$report_cache/$dest_escaped.$$") or die "Cannot write report file: $!\n";
  use bytes;
  print REPORT $report;
  no bytes;
  close(REPORT);
  if (-e "$report_cache/$dest_escaped") {
    my $result = `diff --brief "$report_cache/$dest_escaped" "$report_cache/$dest_escaped.$$"`;
    unless ($result) {
      writelog ('No change, skipping [['.encode_entities($dest).']]');
      unlink "$report_cache/$dest_escaped.$$";
      next;
    }
  }
  $c->login();
  writelog ('Writing report at [['.encode_entities($dest).']]');
  my $edit_summary = "Writing index of archives in " . encode_entities($mask) . " due to request from [[" . encode_entities($page) . "]] - Bot edit";
  my $result = send_report($dest,$report,$edit_summary);
  if ($result) {
      rename "$report_cache/$dest_escaped.$$", "$report_cache/$dest_escaped";
  } else {
      unlink("$report_cache/$dest_escaped.$$");
  }
}

$ul += 120 + length($log_file);
writelog ('Complete, downloaded {{formatnum:'.int($dl/1024).'}} kilobyte(s) and uploaded {{formatnum:'.int($ul/1024).'}} kilobyte(s) (figures approximate)');
&post_log();
exit;

#########################################
# Subroutines                           #
#########################################

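# Download the first $bytes bytes of a page's raw text and return true if the
# "HBC Archive Indexerbot can blank this" permission marker appears in them.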
sub check_text {
  my $bytes = shift;
  my $page = shift;

  my $host = 'en.wikipedia.org';
  my $path = "/w/index.php?title=$page&action=raw";
  my $sock         = new IO::Socket::INET
      (
       PeerAddr    => $host,
       PeerPort    => 80,
       Proto       => 'tcp',
      );
  return 0 unless ($sock);
  my $header = ('GET '.$path.' HTTP/1.1'."\r\n".'Host: '.$host."\r\n".'User-Agent: HBC Archive Indexerbot 0.9a'."\r\n".'Connection: close'."\r\n\r\n");
  syswrite ($sock, $header);
  my($buf , $content, $done);
  while (!$done)
    {
    ($done = 1) unless sysread($sock, $buf, $bytes);
    $content .= $buf;
    if ((length($content) >= $bytes) || ($content =~ m|!-- HBC Archive Indexerbot can blank this --|))
      {
      $done = 1;
      }
    }
  close($sock);
  $dl += length($content);
  return ($content =~ m|!-- HBC Archive Indexerbot can blank this --|);
}

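# Build the full wikitext report for one job: index the archive headings, then
# fill in the template's lead/header/row/footer sections.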
sub create_report {
  my ($ra_job) = @_;
  my ($rh_index, $numbered_links) = index_headings($ra_job);
  my $template = get_template($ra_job->{'template'});
  my $report = sprintf("%sThis report has been generated because of a request at [[%s]]. It covers the archives that match '''%s'''\n<br/>Report generated at ~~~~~ by ~~~\n----\n\n",
                       $template->{'lead'}, $ra_job->{'page'}, join(', ', @{$ra_job->{'mask'}}));
  $report .= $template->{'header'};
  my $i = 0;
  foreach my $key (sort {lc($a) cmp lc($b) || $rh_index->{$a}->{'root_path'} cmp $rh_index->{$b}->{'root_path'}} (keys(%{$rh_index}))) {
    $rh_index->{$key}->{'topic'} =~ s:({{.*?}}|[|!]{2}):<$nowiki>$1</$nowiki>:g;
    my $row = $template->{'row'};
    if ($template->{'altrow'}) {
      unless ($i++ % 2 == 0) {
        $row = $template->{'altrow'}
      }
    }
    foreach ('topic','replies','link','first','last','duration',
             'firstepoch','lastepoch','durationsecs') {
      $row =~ s:%%$_%%:${$rh_index}{$key}{$_}:gi;
    }
    $report .= $row;
  }
  $report .= sprintf("%s\n%s", $template->{'footer'}, $template->{'tail'});
  return $report;
}

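# Download pages in batches of up to 50 via Special:Export and cache each one;
# returns the list of page names actually received.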
sub download_pages {
  my (@pages) = @_;
  return unless @pages;
  my $requests = scalar(@pages);

  my (@received_names);

  while (@pages) {
    my @batch;
    while ((scalar(@batch) < 50) && @pages) {
      my $item = shift(@pages) || last;
      $item = _underscore($item);
      push (@batch, $item);
    }
    $pages_attempted += scalar(@batch);
    my $xml_code = $c->special_export(@batch);
    $dl += length($xml_code);
    my $xml_result = XMLin($xml_code);
    next unless ($xml_result->{'page'});
    if ($xml_result->{'page'}{'title'}) {
      push (@received_names, handle_chunk($xml_result->{'page'}));
    } else {
      foreach my $key (keys %{$xml_result->{'page'}}) {
        push (@received_names, handle_chunk($xml_result->{'page'}->{$key}));
      }
    }
  }
  writelog('Downloaded '.scalar(@received_names)." pages from $requests requests");
  return (@received_names);
}

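# Work out which pages each job still needs (the opt-in page itself, plus pages
# matching each mask) and download them, probing past the highest numbered
# archive in the cache until requests stop returning new pages.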
sub fetch_pages {
  my (@jobs) = @_;

  my (@cache_names) = keys(%$rh_cache_data);
  foreach my $ra_job (@jobs) {
    my @fetch;

    if ($ra_job->{'indexhere'}) {
      my $page = _underscore($ra_job->{'page'});
      push(@fetch, $ra_job->{'page'}) unless (defined($rh_cache_data->{$page}));
    }

    my $fetch_size = 0;
    foreach my $mask (@{$ra_job->{'mask'}}) {
      if ($mask =~ m|<#>|) {
        $fetch_size += 10;
        my $pattern = _underscore($mask);
        my ($part1, $part2) = split(m|<#>|, $pattern, 2);
        $pattern = qr/\Q$part1\E(\d+)/;
        $pattern .= qr/\Q$part2\E/ if $part2;
        my $leading_zeros = $ra_job->{'leading_zeros'}+1;
        my $marker = '%d';
        $marker = '%0'.$leading_zeros.'d' if ($leading_zeros > 1);
        my $printf_pattern = $mask;
        $printf_pattern =~ s|<#>|$marker|;
        my (@mask_pages) = grep(/^$pattern/,@cache_names);
        my $largest = 0;
        foreach my $key (@mask_pages) {
          ($key =~ m|$pattern|) || next;
          $largest = $1 if ($1 > $largest);
        }
        my $count = $largest;
        until ($count >= ($largest + $fetch_size)) {
          $count++;
          my $page_name = sprintf($printf_pattern, $count);
          push(@fetch,$page_name);
        }
      # MONTHLY: elsif here for the <date> or whatever is used
      } else {
        my $check = _underscore($mask);
        push (@fetch, $mask) unless (defined($rh_cache_data->{$check}));
      }
    } continue {
      if (scalar(@fetch)) {
        my (@received) = download_pages(@fetch);
        $rh_cache_data = parse_cache();
        (@cache_names) = keys(%$rh_cache_data);
        if (scalar(@fetch) == scalar(@received)) {
          @fetch = ();
          redo;
        } else {
          @fetch = ();
        }
      }
      $fetch_size = 0;
    }
  }
}

sub find_holes  # This sub will find gaps in the archive (maybe a page was deleted then restored) and
  {             # add them to the list of potentially needed pages
  return();
  }
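# A possible approach (sketch only, deliberately not enabled): reuse the
# $pattern/$largest logic from fetch_pages to spot numbers missing below the
# highest cached archive for each <#> mask, e.g.
#   my %have  = map { m|$pattern| ? ($1 => 1) : () } @mask_pages;
#   my @holes = grep { !$have{$_} } (1 .. $largest);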

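# Return the list of report templates referenced by jobs but not yet cached.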
sub find_templates {
  my (@jobs) = @_;
  my %templates;
  my @templates_needed;
  foreach my $ra_job (@jobs) {
    $templates{$ra_job->{'template'}}++;
  }
  foreach my $template (keys %templates) {
    $template = $default_template if $template eq 'default';
    my $tmpl_under = _underscore($template);
    push(@templates_needed, $template) unless defined($rh_cache_data->{$tmpl_under});
  }
  writelog (scalar(@templates_needed).' templates needed');
  return @templates_needed;
}

sub find_updated # Find items that have changed
  {
  my(@need_update);
  foreach my $page (keys(%{$rh_cache_data})) {
    if ($rh_modified_data->{$page}) { # If it's not on the watchlist, it hasn't
                                      # been modified in the past month; ignore it
      if ($rh_cache_data->{$page} < ${$rh_modified_data}{$page}) {
        push(@need_update,$page);
        my $fname = ("$wiki_cache/".uri_escape_utf8($page).' '.$rh_cache_data->{$page});
        unlink($fname); # Remove old item
      }
    }
  }
  writelog (scalar(@need_update).' pages need updating');
  return @need_update;
  }

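# Scrape Special:Whatlinkshere for pages transcluding the OptIn template and
# parse each one's parameters (target, mask, leading_zeros, indexhere,
# template) into a job hash; requests that duplicate a target are skipped.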
sub gather_jobs
  {
  my (@jobs);
  my $html_list         =  $c->{ua}->get($c->{index}."?title=Special:Whatlinkshere/User:HBC_Archive_Indexerbot/OptIn&limit=5000")->content();
  $dl += length($html_list);
  my @targets;
  while ($html_list =~ s|>([^<]*?)</a> \(transclusion\)||)
    {
    push(@targets,$1);
    }
  my $xml_source = $c->special_export(@targets);
  $dl += length($xml_source);  # count the raw XML, not the parsed hashref
  my $xml = XMLin($xml_source);
  my $rh_pages = ${$xml}{'page'};
  my %targets;
  foreach my $key (keys(%{$rh_pages})) {
    my $content = ${$rh_pages}{$key}{'revision'}{'text'}{'content'};
    if ($content =~ m"\Q{{User:HBC Archive Indexerbot/OptIn\E\s*\|(.+?)\s*\Q}}\E"s) {
      my @params = split(/\s*\|\s*/, $1);
      my %job = ( page => $rh_pages->{$key}{'title'}, leading_zeros => 0 );
      foreach my $param (@params) {
        my ($key, $value) = split(/\s*=\s*/, $param);
        next unless ($key && defined($value));

        $value =~ s:^\.?/:$job{'page'}/:;

        if ($key eq 'target') {
          $job{'target'} = $value;
        } elsif ($key eq 'mask') {
          next unless $value;
          push (@{$job{'mask'}}, $value);
        } elsif ($key =~ /leading_zeroe?s/) {
          if ($value =~ m/^(\d+)$/) {
            $job{'leading_zeros'} = $1;
          }
        } elsif ($key eq 'indexhere') {
          $job{'indexhere'} = (($value =~ m|^ye?s?|i) ? ('1') : ('0'));
        } elsif ($key eq 'template') {
          $job{'template'} = $value;
        }

      }
      $job{'template'} = 'default' unless $job{'template'};
      $job{'template'} = 'default' if $job{'template'} eq 'template location';

      next unless ($job{'target'} && $job{'mask'});

      if ($targets{$job{'target'}}) {
        writelog("Request on [[$job{'page'}]] duplicates target [[$job{'target'}]]; skipping");
        next;
      } else {
        $targets{$job{'target'}}++;
      }

      push(@jobs,\%job);
    }
  }
  return @jobs;
  }

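# Load a report template from the cache and split it on <!-- section -->
# markers into lead/header/row/altrow/footer/tail; falls back to the default
# template when the requested one is missing or has no 'row' section.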
sub get_template {
  my ($template) = (@_);

  if ($template eq 'default') {
    $template = $default_template;
  }

  my $tmpl_fn = _escape($template);
  my ($file) = glob("$wiki_cache/$tmpl_fn*");
  unless ($file) {
    if ($template eq $default_template) {
      die "$template missing from cache\n";
    } else {
      return get_template('default');
    }
  }
  open(TMPL, $file);
  my @content = <TMPL>;
  close(TMPL);

  my %template = (lead => '', header => '', row => '', altrow => '',
                  footer => '', tail => '');
  my $section = '';
  foreach my $line (@content) {
    chomp $line;
    if ($line =~ m:^<!--\s*(.*?)\s*-->$:) {
      $section = lc($1);
      $section =~ s/\s+//g;
      last if $section eq 'end';
    } else {
      if ($section) {
        next unless $line;
        $template{$section} .= "$line\n";
      }
    }
  }
  $template{'lead'} .= "\n" if $template{'lead'};

  unless ($template{'row'}) {
    die "Default template missing 'row' parameter!\n" if $template eq $default_template;
    writelog("Invalid template: '$template', using default instead");
    return get_template('default');
  }

  return \%template;
}

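# Write one exported page to the wiki cache, with its revision timestamp
# encoded in the filename, and add it to the watchlist if it is new.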
sub handle_chunk {
  my $chunk = shift;
  my $name = _underscore(${$chunk}{'title'});
  my $fname = "$wiki_cache/".uri_escape_utf8($name);
  ${$chunk}{'revision'}{'timestamp'} =~ m|(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z|;
  my $time = timegm($6,$5,$4,$3,$2-1,$1);
  watch($name) unless (${$rh_cache_data}{$name});
  open(OUT,">$fname $time");
  binmode(OUT);
  use bytes;
  print OUT (${$chunk}{'revision'}{'text'}{'content'});
  no bytes;
  close(OUT);
  $pages_downloaded++;
  return $name;
}

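# Scan every cached page matching the job's masks (plus the opt-in page itself
# when indexhere is set) for ==headings== and signed comments, recording each
# thread's reply count and first/last timestamps; returns the index hashref
# and the running numbered-link counter.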
sub index_headings {
  my ($ra_job) = @_;

  my $mask_re = '';
  foreach my $mask (@{$ra_job->{'mask'}}) {
    my $mask2 = _escape($mask);
    if ($mask2 =~ m|%3C%23%3E|) {
      my ($part1, $part2) = split(m|%3C%23%3E|, $mask2, 2);
      $mask_re .= '(?:';
      $mask_re .= qr/\Q$part1\E\d+/;
      $mask_re .= qr/\Q$part2\E/ if $part2;
      $mask_re .= ')|';
    # MONTHLY: elsif here for <date>
    } else {
      $mask_re .= qr/\Q$mask2\E/.'|';
    }
  }
  chop($mask_re);

  opendir(CACHE,$wiki_cache);
  my(@cache) = readdir(CACHE);
  closedir(CACHE);
  my @files = grep(m|^(?:$mask_re)|,@cache);
  if ($ra_job->{'indexhere'}) {
    my $page = _escape($ra_job->{'page'});
    push(@files, grep(m|^\Q$page\E \d+$|,@cache));
  }
  my (%index, %used_headings);
  my $numbered_links = 0;
  foreach my $file (@files) {
    my (%used_names);
    next unless ($file =~ m|^(.*) (\d+)$|);
    my $root_path = decode_utf8(uri_unescape($1));
    my $display_name = $root_path;
    $display_name =~ s/_/ /g;
    open(WIKI, "$wiki_cache/$file");
    my @content = <WIKI>;
    close(WIKI);
    my $prev_heading = '';
    my ($comment_count,$first,$last) = (0,0,0);
    foreach my $line (@content) {
      if ($line =~ m|^==\s*([^=].+?)\s*==|) {
        if ($prev_heading && $comment_count > 0) {
          ## WARNING: This code is duplicated below vvvvvv
          $index{$prev_heading}->{'replies'} = $comment_count;
          if ($first && $last) {
            $index{$prev_heading}->{'firstepoch'} = $first;
            $index{$prev_heading}->{'first'} = strftime('%F %T',gmtime($first));
            $index{$prev_heading}->{'lastepoch'} = $last;
            $index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last));
            $index{$prev_heading}->{'durationsecs'} = $last - $first;
            if ($comment_count > 1) {
              $index{$prev_heading}->{'duration'} = duration($last - $first);
            } else {
              $index{$prev_heading}->{'duration'} = 'None';
            }
          }
          $comment_count = 0;
          $first = 0;
          $last = 0;
        }
        my $heading = $1;
        my $head_link;
        ($head_link, $numbered_links) = path_fix($heading, $numbered_links);
        $used_names{lc($head_link)}++;
        my $suffix = (($used_names{lc($head_link)} > 1) ? ('_'.$used_names{lc($head_link)}) : (''));
        $used_headings{lc($head_link.$suffix)}++;
        $prev_heading = $head_link.$suffix.'_'.$used_headings{lc($head_link.$suffix)};
        $index{$prev_heading} = { topic => encode_entities(decode_utf8($heading)), link => ("[[{{urlencode:$root_path}}#$head_link".$suffix."|$display_name]]"),
                                  root_path => $root_path, head_link => $head_link,
                                  replies => 'Unknown', first => 'Unknown',
                                  'last' => 'Unknown', duration => 'Unknown',
                                  firstepoch => 0, lastepoch => 0,
                                  durationsecs => 0,
                                };
      } elsif ($line =~ m/\[\[User.*[\]>)}].*?\s+(.*\(UTC\))/) {
        $comment_count++;
        my $time = str2time($1);
        if ($time && (!$first || $time < $first)) {
          $first = $time;
        }
        if ($time && ($time > $last)) {
          $last = $time;
        }
      }
    }
    if ($prev_heading && $comment_count > 0) {
      ## WARNING: This code is duplicated from above ^^^^^^
      $index{$prev_heading}->{'replies'} = $comment_count;
      if ($first && $last) {
        $index{$prev_heading}->{'firstepoch'} = $first;
        $index{$prev_heading}->{'first'} = strftime('%F %T', gmtime($first));
        $index{$prev_heading}->{'lastepoch'} = $last;
        $index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last));
        $index{$prev_heading}->{'durationsecs'} = $last - $first;
        if ($comment_count > 1) {
          $index{$prev_heading}->{'duration'} = duration($last - $first);
        } else {
          $index{$prev_heading}->{'duration'} = 'None';
        }
      }
    }
  }
  return (\%index, $numbered_links);
}

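# Map each cached page name to the revision timestamp in its filename.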
sub parse_cache
  {
  my (@pages,$count);
  opendir(CACHE,$wiki_cache);
  my(@files) = readdir(CACHE);
  closedir(CACHE);
  my(%cache);
  foreach my $file (@files)
    {
    next unless ($file =~ m|^(.*) (\d+)$|);
    my $page_name = decode_utf8(uri_unescape($1));
    my $time = $2;
    $cache{$page_name} = $time;
    }
  return \%cache;
  }

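# Scrape the HTML watchlist for each watched page's most recent change time.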
sub parse_watchlist
  {
  my $watchlist         =  $c->{ua}->get($c->{index}."?title=Special:Watchlist&days=0")->content();
  $dl += length($watchlist);
  my @lines             =  split("\n",$watchlist);
  my @date;
  my %watchlist;
  while (scalar(@lines))
    {
    my $line = shift(@lines);
    if ($line =~ m|<h4>(\d{4})-(\d{2})-(\d{2})</h4>|i)
      {
      @date = ($1,$2,$3);
      }
    if ($line =~ m|title="([^"]*?)">hist</a>|i) # "
      {
      my $page_name = _underscore($1);
      $line =~ m|(\d{2}):(\d{2}):(\d{2})|;
      $watchlist{$page_name} = timegm($3,$2,$1,$date[2],$date[1]-1,$date[0]);
      }
    }
  return \%watchlist;
  }

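# Convert a heading to the anchor form MediaWiki uses in section links
# (markup stripped, spaces to underscores, other characters dot-escaped).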
sub path_fix {
  my ($path,$numbered_links) = @_;
  ($path =~ s|'{2,4}||g);
  ($path =~ s|<.*?>||g);
  ($path =~ s/\[\[:?.*?\|(.*?)\]\]/$1/g);
  ($path =~ s|\[\[:?(.*?)\]\]|$1|g);
  while ($path =~ m|\[.*?\]|) {
    my $title;
    if ($path =~ m|\[[^ ]* (.*?)\]|) {
      $title = $1;
    } else {
      $numbered_links++;
      $title = ".5B$numbered_links.5D";
    }
    $path =~ s|\[.*?\]|$title|;
  }
  ($path =~ s|\s|_|g);
  ($path =~ s|\xC2\xA0|.C2.A0|g); # encode non-breaking spaces (UTF-8 bytes)
  while ($path =~ m|([^/a-z0-9\.:_'-])|i) {
    my $bad = $1;
    my $fix = uc('.'.sprintf("%x",ord($bad)));
    ($path =~ s/\Q$bad/$fix/g);
  }
  return ($path,$numbered_links);
}

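# Save the accumulated run log to the bot's log page.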
sub post_log {
  my $pg                =  $c->get('User:HBC Archive Indexerbot/logs', 'rw');
  $pg->{summary}        =  'Writing log file for '.$start_time.' - Bot edit';
  $pg->{content}        =  $log_file;
  $pg->save();
}

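# Save a report page, prefixed with the blanking-permission marker, then
# sleep $write_delay seconds; returns the save result.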
sub send_report {
  my $dest      = shift;
  my $report    = shift;
  my $edit_summary = shift;
  my $pg        = $c->get($dest, 'w');
  $pg->{summary}        =  $edit_summary;
  $pg->{content}        =  '<!-- HBC Archive Indexerbot can blank this -->'."\n".$report;
  $ul += length($report);
  my $result = $pg->save();
  unless ($result) {
      my $dest_entities = encode_entities($dest);
      writelog("Failed to save report to $dest_entities");
  }
  sleep($write_delay);
  return $result;
}

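# Add a page to the bot account's watchlist.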
sub watch
  {
  my $page_name = shift;
  my $success = $c->{ua}->get($c->{index}."?title=$page_name&action=watch")->is_success;
  $pages_watched++ if ($success);
  return $success;
  }

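# Append a timestamped entry to the log (posted later by post_log) and echo
# it to STDERR.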
sub writelog {
  my $entry = shift;
  my @month_table =
  (
   'January',
   'February',
   'March',
   'April',
   'May',
   'June',
   'July',
   'August',
   'September',
   'October',
   'November',
   'December',
  );
  my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime(time);
  my $time = sprintf("%02d:%02d:%02d %02d %s %04d", $hour,$min,$sec,$mday,$month_table[$mon],($year+1900));
  $start_time ||= $time;
  $log_file .= ('* '.$time.': '.$entry.' ~~~'."\n");
  warn "$entry\n";
}

sub _escape {
  my ($val) = @_;
  $val = _underscore($val);
  $val = uri_escape_utf8($val);
  return $val;
}

sub _hash {
  my ($val) = @_;
  $val = _escape($val);
  $val = sha1_hex($val);
  return $val;
}

sub _underscore {
  my ($val) = @_;
  $val =~ s|\s|_|g;
  return $val;
}
 

MediaWiki Change

Fixing an error in MediaWiki/page.pm (a forgotten & in the watch URL):

return $obj->{ua}->get($obj->_wiki_url . "&action=" . ($unwatch ? "un" : "") . "watch")->is_success;