User:Lar/ClassificationTableGen

Old version at: User:Lar/ClassificationTableGen/Backlev

Perl code: This code generated User:Lar/Sandbox2 (version 8).. There is a lot of work to do on it yet but if you stumble across this, feedback welcome. Not ready for public release yet (if ever).

Signature shows update time/date ++Lar: t/c 03:34, 27 April 2006 (UTC)

note: requires perl 5.8.... only tested on Wintel platform.

Helper module, based on Pearle Wisebot code edit

... this module needs some better commenting and trimming out of code not needed for what I need. It ALSO needs the user/cookie/token stuff made generic. It is hard coded to use my userid so may not work for anyone else (no you can't have my cookie file. Read the User:Pearle pages on how to set up your own and look for Lar in there and change it to your userid.

Filename WP_util_pearlebot.pm

package WP_util_pearlebot;  # assumes WP_util_pearlebot.pm

# based on boilerplate module declaration found here:
#   http://perldoc.perl.org/perlmod.html#Perl-Modules-module
#
# based on code that is part of the "Pearle Wisebot"
#   http://en.wikipedia.org/wiki/User:Pearle
#   http://en.wikipedia.org/wiki/User:Pearle/pearle-documentation.txt
#   http://en.wikipedia.org/wiki/User:Pearle/pearle.pl
# which was created by [[User:Beland]]: 
#  http://en.wikipedia.org/wiki/User:Beland

# Mods by Larry Pieniazek ( [[user:Lar]] )

use strict;
use warnings;

use Data::Dumper;
use Getopt::Std;

use Time::HiRes;


use utf8;
#use encoding 'utf8';

# Initialization
use LWP;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;



    BEGIN {
        use Exporter   ();
        our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

        # set the version for version checking
        $VERSION     = 1.00;
        # if using RCS/CVS, this may be preferred
        # $VERSION = sprintf "%d.%03d", q$Revision: 1.1 $ =~ /(\d+)/g;

        @ISA         = qw(Exporter);
        @EXPORT      = qw(&func1 &func2 &func4);
        %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],

        # your exported package globals go here,
        # as well as any optionally exported functions
        # @EXPORT_OK   = qw($Var1 %Hashit &func3);
        # we do not have any externals
    }
    our @EXPORT_OK;

    # exported package globals go here
    # our $Var1;
    # our %Hashit;

    # non-exported package globals go here


    # initialize package globals, first exported ones


    # then the others (which are still accessible as $WP_util_pearlebot::stuff)


    # all file-scoped lexicals must be created before
    # the functions below that use them.

    # file-private lexicals go here


    # here's a file-private function as a closure,
    # callable as &$priv_func;  it cannot be prototyped.
    # my $priv_func = sub {
        # stuff goes here.
    # };

    # make all your functions, whether exported or not;
    # remember to put something interesting in the {} stubs
    sub myLog;
    sub getPage;
    sub postPage;
    sub retry;
    sub printWikitext;
    sub test;


    END { }       # module clean-up code here (global destructor)

    ## YOUR CODE GOES HERE


# LWP:UserAgent is a library which allows us to create a "user agent"
# object that handles the low-level details of making HTTP requests.

$::ua = LWP::UserAgent->new(timeout => 300);
$::ua->agent("LarUtil/0.1");
$::ua->cookie_jar(HTTP::Cookies->new(file => "cookies.lar.txt",
				     autosave => 1));
# $::ua->cookie_jar->load();

# $::ua->

#
#  $ua = LWP::UserAgent->new;
#  $req = HTTP::Request->new(GET => 'http://www.linpro.no/secret/');
#  $req->authorization_basic('aas', 'mypassword');
#  print $ua->request($req)->as_string;

# Hot pipes
$| = 1;

#set default speedlimit

 $::speedLimit = 10;

##---

## test();



sub test
{
    my ($target, $text, $editTime, $startTime, $token);

    #$target = "Special:Userlogin";
    #($text, $editTime, $startTime, $token) = getPage($target);

    # temporary
    $::nullOK = "yes";



    $target = "Wikipedia:Sandbox";
    ($text, $editTime, $startTime, $token) = getPage($target);
    print $text;

#    die ("nopost Test complete.");

    $text .= "\nEat my electrons! -- testing Pearle clone ([[User:Lar]]) \n";
    print "---\n";
    postPage ($target, $editTime, $startTime, $token, $text, "Test 028", "najor");  # (no it is not minor)
    die ("Test complete.");
}

##---


sub getPage
{

    my ($target, $request, $response, $reply, $text, $text2,
	$editTime, $startTime, $attemptStartTime, $attemptFinishTime,
	$token);

    $target = $_[0];

    if ($target =~ m/^\s*$/)
    {
	myLog("getPage: Null target.");
	die("getPage: Null target.");
    }

    # urlSafe ($target);

    # Monitor wiki server responsiveness
    $attemptStartTime = Time::HiRes::time();

    # Create a request-object
    print "GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n";
    myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
    
    $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit");

    $response = $::ua->request($request);

    if ($response->is_success)
    {
	$reply = $response->content;

	# Monitor wiki server responsiveness
	$attemptFinishTime = Time::HiRes::time();
	retry ("success", "getPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));
	
	# This detects whether or not we're logged in.
	unless ($reply =~ m%<a href="/wiki/User_talk:Lar">My talk</a>%)
	{
	    # We've lost our identity.
	    myLog ("Wiki server is not recognizing me (1).\n---\n${reply}\n---\n");
	    die ("Wiki server is not recognizing me (1).\n");
	}      
        
        my $saveReply=$reply;
        myLog ("Dump reply prior to regex processing in getPage... \n---\n${saveReply}\n---\n");
        
	$reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
	$text = $1;

        
#	$reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
#	$reply =~ m%<textarea\s*tabindex='1'\s*accesskey=","\s*name="wpTextbox1"\s*rows='25'\s*cols='80'\s*>(.*?)</textarea>%s;
#	$text = $1;

#        print "debug: 1: ".$1."\n";

	$reply =~ m/value="(\d+)" name="wpEdittime"/;
	$editTime = $1;

	# Added 22 Aug 2005 to correctly handle articles that have
	# been undeleted
	$reply =~ m/value="(\d+)" name="wpStarttime"/;
	$startTime = $1;

	# Added 9 Mar 2005 after recent software change.
	$reply =~ m/value="(\w+)" name="wpEditToken"/;
	$token = $1;
	###

	if (($text =~ m/^\s*$/)
	    and ($::nullOK ne "yes"))
	{
	    myLog ("getPage($target): Null text!\n");
	    myLog "\n---\n$reply\n---\n";
	    die ("getPage($target): Null text!\n");
	}

	if (($editTime =~ m/^\s*$/)
	    and ($::nullOK ne "yes"))
	{
	    myLog ("getPage($target): Null time!\n");
	    myLog "\n---\n$reply\n---\n";
	    die ("getPage($target): Null time!\n");
	}

	if (($text =~ m/>/) or
	    ($text =~ m/</))
	{
	    print $text;
	    myLog "\n---\n$text\n---\n";
	    myLog ("getPage($target): Bad text suck!\n");
	    die ("getPage($target): Bad text suck!\n");
	}
	
	# Change ( " -> " ) etc
	# This function is from HTML::Entities.
	decode_entities($text);

	# This may or may not actually work
	$::ua->cookie_jar->save();

	return ($text, $editTime, $startTime, $token);
    } 
    else 
    {
	myLog ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
	print ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
	# 50X HTTP errors mean there is a problem connecting to the wiki server
	if (($response->status_line =~ m/^500/)
	    or ($response->status_line =~ m/^502/)
	    or ($response->status_line =~ m/^503/))
	{
	    return(retry("getPage", @_));
	}
	else
	{
	    # Unhandled HTTP response
	    die ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
	}
    }
}

sub postPage
{
    my ($request, $response, $pageName, $textToPost, $summaryEntry,
	$editTime, $startTime, $actual, $expected, $attemptStartTime,
	$attemptFinishTime, $date, $editToken, $minor);

    $pageName = $_[0];
    $editTime = $_[1];
    $startTime = $_[2];
    $editToken = $_[3];
    $textToPost = $_[4];
    $summaryEntry = $_[5]; # Max 200 chars!
    $minor = $_[6];

    $summaryEntry = substr($summaryEntry, 0, 200);

    if ($pageName eq "")
    {
	myLog ("postPage(): Empty pageName.\n"); 
	die ("postPage(): Empty pageName.\n"); 
    }

    if ($summaryEntry eq "")
    {
	$summaryEntry = "Automated editing.";
    }
    
    # Monitor server responsiveness
    $attemptStartTime = Time::HiRes::time();

    if ($minor eq "yes")
    {
	$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
	[wpTextbox1 => $textToPost,
	 wpSummary => $summaryEntry,
	 wpSave => "Save page",
	 wpMinoredit => "on",
	 wpEditToken => $editToken,
	 wpStarttime => $startTime,
	 wpEdittime => $editTime];
	# Optional: wpWatchthis
    }
    else
    {
	$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
	[wpTextbox1 => $textToPost,
	 wpSummary => $summaryEntry,
	 wpSave => "Save page",
	 wpEditToken => $editToken,
	 wpStarttime => $startTime,
	 wpEdittime => $editTime];
	# Optional: wpWatchthis, wpMinoredit
    }

    # ---
    ## If posts are failing, you can uncomment the below to see what
    ## HTTP request is being made.
    # myLog($request->as_string());
    # print $request->as_string();	$::speedLimit = 60 * 10;
    # print $::ua->request($request)->as_string;
    # ---

    myLog("POSTing...");
    print "POSTing...";
    # Pass request to the user agent and get a response back
    $response = $::ua->request($request);
    myLog("POSTed.\n");
    print "POSTed.\n";


    if ($response->content =~ m/Please confirm that really want to recreate this article./)
    {
	myLog ($response->content."\n");
	die ("Deleted article conflict! See log!");
    }


    # Check the outcome of the response
    if (($response->is_success) or ($response->is_redirect))
    {
	# Monitor server responsiveness
	$attemptFinishTime = Time::HiRes::time();
	retry ("success", "postPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));


	$expected = "302 Moved Temporarily";
	$actual = $response->status_line;
	if (($expected ne $actual)
	    and ($actual ne "200 OK"))
	{
	    myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n");
	    myLog ($request->as_string());
	    myLog ("EXPECTED: '${expected}'\n");
	    myLog ("  ACTUAL: '${actual}'\n");

	    die ("postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n");
	}

	$expected = "http://en.wikipedia.org/wiki/${pageName}";
	$expected =~ s/\'/%27/g;
	$expected =~ s/\*/%2A/g;
	# $expected = urlEncode($expected);

	$actual = $response->headers->header("Location");


	if (($expected ne $actual)
 	    and !(($actual eq "") and ($response->status_line eq "200 OK")))
 	{
 	    myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n");
	    myLog ("EXPECTED: '${expected}'\n");
	    myLog ("  ACTUAL: '${actual}'\n");
	    die ("postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n");
	}


	if ($response->content =~ m/<h1 class="firstHeading">Edit conflict/)
	{
	    myLog ("Edit conflict on '$pageName' at '$editTime'!\n");
	    die ("Edit conflict on '$pageName' at '$editTime'!\n");
	}

	$::ua->cookie_jar->save();
	return ($response->content);
    }
    else
    {
	$date = `date`;
	$date =~ s/\n//g;
	myLog ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n");

	# 50X HTTP errors mean there is a problem connecting to the wiki server
	if (($response->status_line =~ m/^500/)
	    or ($response->status_line =~ m/^502/)
	    or ($response->status_line =~ m/^503/))
	{
	    print "Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n";
	    return(retry("postPage", @_));
	}
	else
	{
	    # Unhandled HTTP response
	    die ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n");
	}
    }
}

 
 
sub myLog
{
    open (LOG, ">>pearle-wisebot.ersatz.log.txt") 
	|| die "Could not append to log!";
    print LOG $_[0];
    close (LOG);
}



# A call to this recursive function handles any retries necessary to
# wait out network or server problems.  It's a bit of a hack.
sub retry
{

    my ($callType, @args, $i, $normalDelay, $firstRetry,
	$secondRetry, $thirdRetry);

    ($callType, @args) = @_;

    ### ATTENTION ###
    # Increasing the speed of the bot to faster than 1 edit every 10
    # seconds violates English Wikipedia rules as of April, 2005, and
    # will cause your bot to be banned.  So don't change $normalDelay
    # unless you know what you are doing.  Other sites may have
    # similar policies, and you are advised to check before using your
    # bot at the default speed.
    #################

    # HTTP failures are usually an indication of high server load.
    # The retry settings here are designed to give human editors
    # priority use of the server, by allowing it ample recovering time
    # when load is high.

    # Time to wait before retry on failure, in seconds
    $normalDelay = 10;       # Normal interval between edits is 10 seconds
    $firstRetry = 60;        # First delay on fail is 1 minute
    $secondRetry = 60 * 10;  # Second delay on fail is 10 minutes
    $thirdRetry = 60 * 60;   # Third delay on fail is 1 hour
    
    # SUCCESS CASE
    # e.g. retry ("success", "getPage", "0.23");
    if ($callType eq "success")
    {
	myLog("Response time for ".$args[0]." (sec): ".$args[1]."\n");
	$::retryDelay = $normalDelay;

	if ($args[0] eq "postPage")
	{
	    # If the response time is greater than 20 seconds...
	    if ($args[1] > 20)
	    {
		print "Wikipedia is very slow.  Increasing minimum wait to 10 min...\n";
		myLog("Wikipedia is very slow.  Increasing minimum wait to 10 min...\n");
		
		$::speedLimit = 60 * 10;
	    }

	    # If the response time is between 10 and 20 seconds...
	    elsif ($args[1] > 10)
	    {
		print "Wikipedia is somewhat slow.  Setting minimum wait to 60 sec...\n";
		myLog("Wikipedia is somewhat slow.  Setting minimum wait to 60 sec...\n");
		
		$::speedLimit = 60;
	    }

	    # If the response time is less than 10 seconds...
	    else
	    {
		if ($::speedLimit > 10)
		{
		    print "Returning to normal minimum wait time.\n";
		    myLog("Returning to normal minimum wait time.\n");
		    $::speedLimit = 10;
		}
	    }
	}
	return();
    }

    # e.g. retry ("getPage", "George_Washington")
    # FAILURE CASES
    elsif (($::retryDelay == $normalDelay)
	   or ($::retryDelay == 0))
    {
	print "First retry for ".$args[0]."\n";
	myLog("First retry for ".$args[0]."\n");
	$::retryDelay = $firstRetry;
	$::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $firstRetry)
    {
	print "Second retry for ".$args[0]."\n";
	myLog("Second retry for ".$args[0]."\n");
	$::retryDelay = $secondRetry;
	$::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $secondRetry)
    {
	print "Third retry for ".$args[0]."\n";
	myLog("Third retry for ".$args[0]."\n");
	$::retryDelay = $thirdRetry;
	$::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $thirdRetry)
    {
	print "Nth retry for ".$args[0]."\n";
	myLog("Nth retry for ".$args[0]."\n");
	$::retryDelay = $thirdRetry;
	$::speedLimit = 60 * 10;
    }
    else
    {
	die ("retry(): Internal error - unknown delay factor '".$::retryDelay."'\n");
    }

    # DEFAULT TO FAILURE CASE HANDLING
    
    $i = $::retryDelay;
    while ($i >= 0)
    {
	sleep (1);
	print STDERR "Waiting $i seconds for retry...\r";
	$i--;
    }
    print "                                     \r";

    # DO THE ACTUAL RETRY
    if ($callType eq "getPage")
    {
	return(getPage(@args));
    }
    elsif ($callType eq "postPage")
    {
	return(postPage(@args));
    }
    elsif ($callType eq "getCategoryArticles")
    {
	return(getCategoryArticles(@args));
    }
    elsif ($callType eq "getSubcategories")
    {
	return(getSubcategories(@args));
    }
    elsif ($callType eq "getURL")
    {
	return(getURL(@args));
    }
    else
    {
	myLog ("retry(): Unknown callType: $callType\n");
	die ("retry(): Unknown callType: $callType\n");
    }
}



# perl pearle.pl PRINT_WIKITEXT Article_you_want_to_get
## Warning: Saves to a file in the current directory with the same name
## as the article, plus another file with the .html extention.
sub printWikitext
{
    my ($editTime, $startTime, $text, $target, $token);

    $target = $_[0];

    $target =~ s/^\[\[://;
    $target =~ s/\]\]$//;

    ($text, $editTime, $startTime, $token) = getPage($target);

    # Save the wikicode version to a file.
    open (WIKITEXT, ">./${target}");
    print WIKITEXT $text;
    close (WIKITEXT);

    # Save the HTML version to a file.
    print `wget http://en.wikipedia.org/wiki/${target} -O ./${target}.html`;
}


    1;  # don't forget to return a true value from the file

Main code edit

Invoke as (for example)

perl genClassTable.pl -d 2 -C Wikipedia:WikiProject_The_Beatles/Categories -a leaveOrdered.txt -q C:\shortprg\AWB\enwiki-20060303-categorylinks.sql -o bigone2c.txt >runlog2c.txt

Fillename: genClassTable.pl


#!/usr/bin/perl -w
#---------------------------------------------------------------------------#
# process files and generate a category table                       
# Author: Larry Pieniazek (IBM/Ascential Software) as hobby project
# Adapted from stuff I cribbed from all over.  
# (c)Larry Pieniazek 2006. This library is free software; you can redistribute it 
# and/or modify it under the same terms as Perl itself.  
# additionally, can be redistributed and modified under GFDL or CC-SA as you choose        
# 
# Abstract:
#	This perlscript is designed to parse category SQL dumps from wikipedia
#	which are found here: http://download.wikimedia.org/enwiki/
#   For example the 23 March dump is called 
#	  http://download.wikimedia.org/enwiki/20060323/enwiki-20060323-categorylinks.sql.gz
#   
#	The parsing is to generate article classification tables such as those found at
#	  http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification
#
#   In addition to the dump (currently must have been converted to linefeed delimited tuples)
#	the other input is a list of categories of interest, one per line.
#
#---------------------------------------------------------------------------#
use strict;

use WP_util_pearlebot;
use WP_util_ClassTable;

use Getopt::Std;
    

#---------------------------------------------------------------------------#
# Main routine -
#	process options
#	read in categories desired
#	build hash of articles by parsing SQL file
#	write out table file using hash           
#---------------------------------------------------------------------------#
# main
    my $rc=0;
    # print "prior to getopts\n";
	
    getopts('hvd:q:a:C:c:o:', \%WP_util_ClassTable::options) or WP_util_ClassTable::Usage(); # debug also d

    # print "post getopts, pre process\n";
    WP_util_ClassTable::ProcessOptions();
    		    		
    if ($WP_util_ClassTable::debug>1) { print "post process, pre read cat\n";	}
    
    if (defined $WP_util_ClassTable::options{'a'}) { # using page with art special key list
         WP_util_ClassTable::ReadArtKeyFile(); 
    } else { # no list to read so make it empty
        %WP_util_ClassTable::artSpecialKeyHash = ();
    }
    
    if (defined $WP_util_ClassTable::options{'C'}) { # using page with cat lists
        # $rc=FetchCatPage("Wikipedia:WikiProject_The Beatles/Categories");
        $rc=WP_util_ClassTable::FetchCatPage($WP_util_ClassTable::catArtPage);
        if ($rc) { die "error fetching category list from Wikipedia\n"; }
    } else {
        $rc=WP_util_ClassTable::ReadCatFile();
        if ($rc) { die "error reading category list\n"; }
    }

    $rc=WP_util_ClassTable::ParseSQL();
    if ($rc) { die "error reading SQL or building structure\n"; }
    
#    $rc=WP_util_ClassTable::WriteHash();
#    if ($rc) { die "error writing hash\n"; }    

    # exit 0;

    $rc=WP_util_ClassTable::WriteTable();
    if ($rc) { die "error building table\n"; }

exit 0;