User:Plastikspork/tfd helper.pl

#!/usr/bin/perl

# Insist that all variables be declared
use strict;

# Allow for utf8 characters
use utf8;

# Use the MediaWiki::Bot library
use MediaWiki::Bot;

# Use the Encode library
use Encode;

# Script-wide state shared by the prompts, the main loop and the helper subs.
# Each variable is declared exactly once and given its default right away
# (the task menu used to be declared twice, the second "my" masking the first).
my $user         = 'SporkBot';  # Account used to log in
my $pass         = "";          # Password; asked for interactively
my $task         = 0;           # Index into @tasklist
my $edit_summary = 'Orphan per [[WP:TFD|TFD outcome]]';
my $is_minor     = 1;           # Label edits as minor

# Work lists filled in as the script progresses
my (@pagelist, @templatelist, @regexplist);
my ($namespace_string, $templates) = ("","");

# Menu of supported tasks; the position in this list is the task number
my @tasklist = (
  "Remove a template and trailing whitespace and newline",
  "Replace a template and whitespace with a single space",
  "Replace a template with nothing",
  "Substitute a template and use subst=subst:",
  "Substitute a template"
);

# Ask which task to perform; the answer is an index into @tasklist
print "What task is being performed?\n";
for my $i (0 .. $#tasklist) {
  print $i .") ". $tasklist[$i] ."\n";
}
print "Press enter for default [".$task."]\n";
while (1) {
  print "Task: ";
  my $answer = &my_get_response($task);
  # Accept only a listed task number.  Anything else used to be compared
  # numerically later on, so a word silently meant task 0 (removal) and an
  # out-of-range number meant "do nothing but still submit".
  if ( $answer =~ /^[0-9]+$/ and $answer <= $#tasklist ) {
    $task = $answer + 0;
    last;
  }
  print "Please enter a task number from 0 to ".$#tasklist."\n";
}

# Get the name of the template(s) to orphan
print "Which template(s) would you like to orphan?\n";
print "For a list, delimit with commas\n";
print "Template: ";
$templates = &my_get_response("BLANK RESPONSE");

# Replace underscores and excess spaces
$templates =~ s/[_ ]+/ /g;

# Create the template orphan list.  Every entry is trimmed and has its own
# leading "Template:" prefix removed, so "Template:A, Template:B" yields
# ("A", "B") rather than only the first entry being cleaned up.
@templatelist = ();
foreach my $name (split /,/, $templates) {
  $name =~ s/^ +//;
  $name =~ s/ +$//;
  $name =~ s/^[Tt]emplate *: *//;
  # Skip empty entries such as the gap in "A,,B"
  push @templatelist, $name if length $name;
}

# Set the namespace
$namespace_string = &my_get_namespace_list();

# Get username
print "Username [".$user."]: ";
$user = &my_get_response($user);

# Get password
print "Password [".$pass."]: ";
$pass = &my_get_response($pass);

# Build the bot object and point it at the English Wikipedia
my $editor = MediaWiki::Bot->new($user);
$editor->set_wiki('en.wikipedia.org', 'w');

# Debug output shows what the bot library is doing
$editor->{debug} = 1;

# Log in.  The check treats a true return value as failure.
# NOTE(review): presumably this matches the older MediaWiki::Bot convention
# of returning 0 on success -- confirm against the installed version.
my $login_status = $editor->login($user, $pass);
if ( $login_status ) {
  print "Failed to login!\n";
  exit;
}

# Try to find the TFD discussion for the first template on the list by
# looking at the Wikipedia-namespace (4) pages that refer to it
my @wp_backlinks = $editor->what_links_here_ns("Template:".$templatelist[0],4);
my $is_subst_task = ( ($task == 3) or ($task == 4) );

foreach my $backlink (@wp_backlinks) {
  # An empty type marks a plain link (not a redirect or a transclusion)
  next unless $backlink->{'type'} =~ /^$/;

  # Only pages with "/log/" in the title are of interest
  my $log_page = $backlink->{'title'};
  next unless $log_page =~ /\/log\//i;

  # No early exit: when several pages qualify, the last one sets the summary
  if ( $is_subst_task ) {
    $edit_summary = "Replace template per ".
      "[[".$log_page."|TFD outcome]]; no change in content";
  } else {
    $edit_summary = "Remove template per ".
      "[[".$log_page."|TFD outcome]]";
  }
}

# Let the operator replace the generated edit summary
print "The current edit summary is set to:\n$edit_summary\n";
print "Provide a new edit summary, or press enter to use the default\n";
print "Summary: ";
$edit_summary = &my_get_response($edit_summary);

# Create a list of regular expressions: one for each template name, followed
# by one for every redirect to that template
foreach my $template (@templatelist) {
  # Add template to the list
  push @regexplist, &my_template_regexp($template);

  # Add redirects to the list (looked up in the Template namespace, 10)
  foreach my $tstruct ($editor->what_links_here_ns("Template:".$template, 10)) {
    # Only want redirects, not links or transclusions
    next unless $tstruct->{'type'} =~ /redirect/;

    # Remove leading Template: in name
    my $redirect = $tstruct->{'title'};
    $redirect =~ s/^Template://;
    push @regexplist, &my_template_regexp($redirect);
  }
}

# Turn a template name into a regular expression fragment matching the ways
# that name can be written in wikitext.
#   Argument: the template name, without the "Template:" prefix
#   Returns:  the pattern as a plain string
sub my_template_regexp {
  my $str = shift;

  # Escape every regular-expression metacharacter.  The hand-written list
  # used before covered only ( ) - . [ ], so a name such as "C++" produced
  # a wrong pattern.
  $str = quotemeta($str);
  # First character is case insensitive
  $str =~ s/^([A-Za-z])/[\u$1\l$1]/;
  # Wikipedia allows for underscores and spaces in template references
  # (quotemeta has turned every space into backslash-space)
  $str =~ s/\\ /[_ ]+/g;

  return $str;
}

# Work out which namespaces to query: the numbers flagged "=1" in the
# namespace string, or a single blank entry meaning every namespace
my @wanted_namespaces = $namespace_string
  ? ( map { /ns([0-9]+)[=]1/ ? ($1) : () } split(/\&/, $namespace_string) )
  : ( '' );

# Collect the transclusions of every template in each wanted namespace
foreach my $template (@templatelist) {
  foreach my $ns (@wanted_namespaces) {
    my @backlinks = $editor->what_links_here_ns("Template:".$template, $ns);
    # Keep transclusions only; plain links and redirects are ignored
    push @pagelist,
      map  { $_->{'title'} }
      grep { $_->{'type'} =~ /transclusion/ } @backlinks;
  }
}

# Sort the page list and keep only the first occurrence of each title
my %already_listed;
@pagelist = grep { !$already_listed{$_}++ } sort @pagelist;

# Confirm job parameters before starting; a true return value means the
# job was not confirmed, so exit
exit if &my_confirm_job();

# Loop over all pages in the page list
foreach my $page (@pagelist) {
  # Show which pages are being processed
  print "Processing: ".$page."\n";

  # Pull the wikitext for the page; it is kept unmodified for the diff
  my $old = $editor->get_text($page);
  unless ( defined $old ) {
    # Without the text, an edit would be built from an empty string
    print "Could not fetch the page text; skipping\n";
    next;
  }

  my $text = &my_apply_task($old, $task, @regexplist);

  # Nothing matched: there is no diff to review and nothing to submit
  if ( $text eq $old ) {
    print "No changes to make; skipping\n";
    next;
  }

  &my_simple_diff($old,$text);

  print "Press enter to submit, s to skip, or ctrl-c to quit\n";
  my $response = &my_get_response("");

  if ($response =~ /s/i) {
    print "Skipping...\n";
  } else {
    print "Submitting...\n";
    # Submit to Wikipedia.
    # Warning: This does not warn of edit conflicts.
    $editor->edit($page, $text, $edit_summary, $is_minor);
  }

  # Take a break (frequent edits are forbidden per bot policy)
  print "Sleep 10\n";
  sleep 10;
}

# Apply one task to a page's wikitext and return the resulting wikitext.
#   $wikitext - the full text of the page
#   $task_no  - task number (0-2 remove the template, 3-4 substitute it)
#   @patterns - regular-expression fragments matching the template names
# Text inside <!-- ... --> comments is never modified.
sub my_apply_task {
  my ($wikitext, $task_no, @patterns) = @_;

  # Template parameters: anything without braces, plus complete {{...}}
  # calls nested one level deep.  Deeper nesting does not match, which
  # leaves such a template untouched rather than half removed.
  my $params = qr/(?:[^{}]|\{\{[^{}]*\}\})*/;

  # Split the page into chunks to isolate comments
  my @chunks = split(/(<!--.*?-->)/s, $wikitext);

  foreach my $chunk (@chunks) {
    # Skip chunks containing a comment
    next if $chunk =~ /<!--/;

    foreach my $str (@patterns) {
      # Opening braces, optional "Template:" prefix and the name (capture 1)
      my $open = qr/\{\{[_ ]*(?:[Tt]emplate:|)[_ ]*($str)[_ ]*/;
      # The whole call including its parameters and closing braces.  The
      # removal tasks used to stop at the first "|", leaving the parameters
      # and the closing braces behind in the page.
      my $call = qr/$open(?:\|$params)?\}\}/;

      if ($task_no == 0) {
        # Remove the template and trailing spaces/newlines
        $chunk =~ s/${call}[ ]*[\r\n]?//g;
      } elsif ($task_no == 1) {
        # Replace the template and surrounding spaces with a single space
        $chunk =~ s/[ ]*${call}[ ]*/ /g;
      } elsif ($task_no == 2) {
        # Replace the template with nothing
        $chunk =~ s/${call}//g;
      } elsif ($task_no == 3) {
        # Substitute the template with subst=subst:
        $chunk =~ s/${open}(\||\}\})/{{subst:$1|subst=subst:$2/g;
      } elsif ($task_no == 4) {
        # Substitute the template
        $chunk =~ s/${open}(\||\}\})/{{subst:$1$2/g;
      }
    }
  }

  return join('', @chunks);
}

# Ask the operator which namespaces the job should cover.
#   Returns: a string of "&nsN=1" entries, one per selected namespace in
#            ascending order, or "" when every namespace is wanted.
# The prompt is repeated until the answer names only known namespaces and
# selects at least one of them.  An unknown number used to produce an empty
# selection, which the caller reads as "every namespace".
sub my_get_namespace_list {
  my @known    = (0 .. 15, 100, 101, 108, 109);
  my %is_known = map { $_ => 1 } @known;

  print "Provide a list of namespaces in which the script will operate\n";
  print "-------------- ------------------- -------------- -----------------\n";
  print "  0 Article      1 Talk              2 User         3 User talk\n";
  print "  4 Wikipedia    5 Wikipedia talk    6 File         7 File talk\n";
  print "  8 MediaWiki    9 MediaWiki talk   10 Template    11 Template talk\n";
  print " 12 Help        13 Help talk        14 Category    15 Category talk\n";
  print "100 Portal     101 Portal talk     108 Book       109 Book talk\n";
  print "-------------- ------------------- -------------- -----------------\n";
  print "* Delimit list with commas for multiple namespaces\n";
  print "* Start the list with 'no' for the compliment of the list\n";
  print "* Leave blank for everything\n";

  while (1) {
    print "Namespace: ";
    my $ns = <STDIN>;
    # End of input is treated like a blank answer
    $ns = "" unless defined $ns;

    # Remove spaces and newlines
    $ns =~ s/[\t\r\n ]//g;

    # Blank means everything
    return "" if $ns eq "";

    # A leading "no"/"not" asks for the complement of the list
    my $complement = 0;
    if ( $ns =~ /^not*([0-9].*)$/ ) {
      $complement = 1;
      $ns = $1;
    }

    # The entries are looked up in a hash, never used as patterns, so odd
    # input cannot break a regular expression
    my @picked  = grep { length } split /,/, $ns;
    my @unknown = grep { !$is_known{$_} } @picked;
    if ( @unknown or !@picked ) {
      print "Unknown namespace(s): ".join(", ", @unknown)."\n" if @unknown;
      print "Please use the numbers from the table above\n";
      next;
    }

    my %picked = map { $_ => 1 } @picked;
    my @keep   = grep { $complement ? !$picked{$_} : $picked{$_} } @known;
    unless ( @keep ) {
      # An empty result would be taken to mean "everything"
      print "That selection leaves no namespace at all\n";
      next;
    }

    return join('', map { "&ns".$_."=1" } @keep);
  }
}

# Show the job summary and a small menu until the operator decides.
#   Returns: 0 when the operator explicitly starts the job (option 0),
#            1 when the operator quits (option 5) or the input ends.
# Every other answer, including a bare Enter, is reported as unknown and
# the menu is shown again; such answers used to start the job.
sub my_confirm_job
{
 # Begin by showing the summary (option 4)
 my $opt = 4;
 while (1) {
  if ( $opt == 1 ) {
    print "----------------------------------------------------------------\n";
    print "                         Template list                          \n";
    print "----------------------------------------------------------------\n";
    foreach (@templatelist) {
      print $_."\n";
    }
  } elsif ( $opt == 2 ) {
    print "----------------------------------------------------------------\n";
    print "                     Regular expression list                    \n";
    print "----------------------------------------------------------------\n";
    foreach (@regexplist) {
      print $_."\n";
    }
  } elsif ( $opt == 3 ) {
    print "----------------------------------------------------------------\n";
    print "                           Page list                            \n";
    print "----------------------------------------------------------------\n";
    foreach (@pagelist) {
      print $_."\n";
    }
  } elsif ( $opt == 4 ) {
    print "----------------------------------------------------------------\n";
    print "                         Edit Summary                           \n";
    print "----------------------------------------------------------------\n";
    print "\n".$edit_summary."\n\n";
    print "----------------------------------------------------------------\n";
    print "                          Job Summary                           \n";
    print "----------------------------------------------------------------\n";
    print "\n";
    print " Username:                        ". $user ."\n";
    print " Task:                            ". $tasklist[$task] . "\n";
    print " Number of templates:             ". scalar @templatelist ."\n";
    print " Number of templates + redirects  ". scalar @regexplist ."\n";
    print " Number of pages to process       ". scalar @pagelist ."\n";
    print " Namespace string                 ". $namespace_string ."\n";
    print "----------------------------------------------------------------\n";
    print "\n";
  } else {
    print " Unknown option\n";
  }
  print " 0) Start job\n";
  print " 1) Show list of templates\n";
  print " 2) Show list of regular expressions\n";
  print " 3) Show list of pages\n";
  print " 4) Redisplay summary\n";
  print " 5) Quit\n";
  print "Option: ";
  $opt = <STDIN>;

  # End of input: never start a job nobody confirmed
  return 1 unless defined $opt;

  # Remove newlines, underscores, and excess spaces
  $opt =~ s/[_ \r\n]+//g;

  # Anything that is not a plain number becomes an unknown option
  $opt = ( $opt =~ /^[0-9]+$/ ) ? $opt + 0 : -1;

  return 0 if $opt == 0;
  return 1 if $opt == 5;
 }
}

# Simple keyboard response: read one line from STDIN and tidy it up.
#   Argument: the value to return when the operator just presses Enter
#   Returns:  the line with tabs and newlines turned into spaces and the
#             leading/trailing spaces removed, or the default when nothing
#             is left (or when the input has ended)
# Declared without a prototype: the former "()" claimed the sub takes no
# arguments although it takes one.
sub my_get_response
{
  my $val = shift;
  my $in = <STDIN>;
  # At end of input <STDIN> gives undef; treat that as an empty answer
  $in = "" unless defined $in;
  # Replace newlines and tabs with spaces
  $in =~ s/[\t\r\n]/ /g;
  # Remove leading spaces
  $in =~ s/^ +//;
  # Remove trailing spaces
  $in =~ s/ +$//;
  return length($in) ? $in : $val;
}

# Very basic line-by-line diff, printed to the selected output handle.
#   Arguments: the old text and the new text
# A line missing from the new text is recognised when the following old
# line equals the current new line; any other mismatch is printed as a
# changed pair.  Lines left over once the shorter text runs out are printed
# too, so a removal at the very end of a page is no longer silent.
sub my_simple_diff
{
  # Not named $a/$b, which are reserved for sort
  my ($old_text, $new_text) = @_;
  $old_text = "" unless defined $old_text;
  $new_text = "" unless defined $new_text;

  my @alines = split /[\r\n]/, $old_text;
  my @blines = split /[\r\n]/, $new_text;

  my $i = 0;
  my $j = 0;

  while( $i < scalar @alines and $j < scalar @blines ) {
    if( $alines[$i] ne $blines[$j] ) {
      # Look one old line ahead, but never past the end of the old text
      if( $i + 1 < scalar @alines and $alines[$i+1] eq $blines[$j] ) {
        print "**** Diff ($i,-) ****\n";
        print "- ".$alines[$i]."\n";
        # Step over the removed line; the matching pair is consumed below
        $i++;
      } else {
        print "**** Diff ($i,$j) ****\n";
        print "+ ". $blines[$j]."\n";
        print "- ". $alines[$i]."\n";
      }
    }
    $i++;
    $j++;
  }

  # Old lines with no counterpart were removed
  for( ; $i < scalar @alines; $i++ ) {
    print "**** Diff ($i,-) ****\n";
    print "- ".$alines[$i]."\n";
  }
  # New lines with no counterpart were added
  for( ; $j < scalar @blines; $j++ ) {
    print "**** Diff (-,$j) ****\n";
    print "+ ".$blines[$j]."\n";
  }
}

# Custom module for namespace restricted "what links here" list, added to
# the MediaWiki::Bot class.
#   $article - full page title, e.g. "Template:Foo"
#   $ns      - namespace number, or '' (or undef) for no restriction
#   Returns:  a list of hash references { title => ..., type => ... } where
#             type is "redirect", "transclusion" or "" for a plain link.
#             The list is empty when the request fails.  The former failure
#             value 1 became a one-element list that every caller then
#             tried to use as a hash reference.
sub MediaWiki::Bot::what_links_here_ns {
  my ($self, $article, $ns) = @_;
  $ns = '' unless defined $ns;
  my @links;

  $article = MediaWiki::Bot::uri_escape_utf8( $article );

  my $res =
    $self->_get( 'Special:Whatlinkshere', 'view',
      "&target=$article&limit=5000&namespace=$ns" );
  unless (ref($res) eq 'HTTP::Response' && $res->is_success) { return; }
  my $content = $res->decoded_content;

  # Each list item holds a link to the page followed by an optional note
  # such as "(redirect page)" or "(transclusion)"
  while (
    $content =~ m{<li><a href="[^"]+" title="([^"]+)">[^<]+</a>([^<]*)}g ) {
    my ($title, $note) = ($1, $2);

    # A redirect note takes precedence over a transclusion note
    my $type = $note =~ /\(redirect page\)/ ? "redirect"
             : $note =~ /\(transclusion\)/  ? "transclusion"
             :                                "";

    push @links, { title => $title, type => $type };
  }

  return @links;
}