#!/usr/bin/perl -w use strict; use warnings; use CGI; use CGI::Carp "fatalsToBrowser"; use LWP::Simple; use LWP::UserAgent; use HTTP::Request; use HTTP::Request::Common qw(GET); use HTTP::Response; #______________________________________________________________________________# # PockBot.pl Version 0.01 PRE_RELEASE # # Author Dan Adams , (User:PocklingtonDan) # # Created 29/11/06 Last Modified 04/12/06 # #______________________________________________________________________________# #______________________________________________________________________________# # RIGHTS MANAGEMENT ETC # # # # The source code for PockBot is supplied solely for the purposes of allowing # # other editors to comment on and improve the code, and/or to run the code as # # a clone. It may be distributed and modified as required for these purposes. # #______________________________________________________________________________# #______________________________________________________________________________# # CHANGES STILL TO MAKE # # # # - timesouts on larger categories # # - make it write to wiki talk page for the category in scrollbox # # - "PockBot ran successfully" presented even if category non-existent # # # # RECENT CHANGES # # # # 05.12.06 - Version 0.01 source code released # #______________________________________________________________________________# #______________________________________________________________________________# # WHAT THE SCRIPT DOES # # # # This script is a wikipedia bot. It acts as a web spider. Given a wikipedia # # category page to start from, it finds all articles listed in that category # # as well as all subcategories of that category. For every subcategory it # # pulls a list of articles. For all articles retrieved (a list of all articles # # in that category and its subcategories) it then retrieves the CLASS flag for # # each page from wikipedia. It then presents these resulsts in tabulated form. # # # # INTENDED USE # # # # It is intended that this script would be useful to those trying to monitor # # all pages within a category for purposes of administration or for a project # # in order to monitor which articles need bringing up from stub or start class # # to full article status. # # # # CODE FORMATTING # # # # Code is formatted for ease of editing with Textad (www.textpad.com) or # # similar editor with colour-coding meta-markup. It may be difficult to scan # # using a no-frills text editor. # #______________________________________________________________________________# #______________________________________________________________________________# # MAIN ROUTINE # #______________________________________________________________________________# use CGI qw(:standard Vars); my $action = param('action') || 'startBot'; if ($action eq 'intro') {&startBot;} elsif ($action eq 'disableBot') {&disableBot;} elsif ($action eq 'enableBot') {&enableBot;} elsif ($action eq 'getMainCategory') {&getMainCategory;} else {&error("Unrecognised action request");} exit; #______________________________________________________________________________# # SUBROUTINES # #______________________________________________________________________________# sub startBot { &checkIfBotOnline; &logAction("Bot requested"); &printOnlineHeader; print "<p><font face=\"arial\">Please enter the wikipedia Category you wish to process below:</font></p>"; print "<FORM action=\"http://ccgi.thepaty.plus.com/cgi-bin/PockBot.cgi\" method=\"post\">"; print "<p><font face=\"arial\">Category:"; print "<INPUT type=\"text\" style=\"font-family: arial, serif; font-size: 12px;\" size=\"50\" name=\"category_specified\" value=\"Enter category name here!\"></font></p>"; print "<p><font face=\"arial\"><b>Note: Bot may take up to an hour to run for large categories.</b></font></p>"; print "<INPUT type=\"hidden\" name=\"action\" value=\"getMainCategory\">"; print "<INPUT type=\"submit\" value=\"Send\">"; print " </FORM>"; &printFooter; } #______________________________________________________________________________# sub getArticlesinCategory { my $content_articles = $_[0]; &logAction("Searching for articles in this category "); # if its not a wikipedia category page, return empty array unless ($content_articles =~ m/<div id="mw-pages">/){ $content_articles = ""; my @found_articles = split(/\|/,$content_articles); &logAction("Found 0 articles in this category "); return (@found_articles); } # empty array if no articles, else populate with article names if ($content_articles =~ m/There are 0 pages in this section of this category/){ $content_articles = ""; &logAction("Found 0 articles in this category"); } else { $content_articles =~ s/[\s\S]*<div id="mw-pages">//; $content_articles =~ s/<\/div>[\s\S]*/<\/div>/; $content_articles =~ s/[\s\S]*?<ul>/<ul>/; $content_articles =~ s/<h3>[\s\S]*?<\/h3>//g; $content_articles =~ s/<ul>//g; $content_articles =~ s/<\/ul>//g; $content_articles =~ s/<td>//g; $content_articles =~ s/<\/td>//g; $content_articles =~ s/<\/div>//g; $content_articles =~ s/<\/tr>//g; $content_articles =~ s/<\/table>//g; $content_articles =~ s/<\/li>/|/g; $content_articles =~ s/<li>/|/g; $content_articles =~ s/\n//g; $content_articles =~ s/\|\|/\|/g; $content_articles =~ s/<a[\s\S]*?>//g; $content_articles =~ s/<\/a>//g; $content_articles =~ s/\|$//; $content_articles =~ s/^\|//; $content_articles =~ s/_/ /g; $content_articles =~ s/\s\|/\|/g; &logAction("Found 1 or more articles in this category"); } my @found_articles = split(/\|/,$content_articles); return (@found_articles); } #______________________________________________________________________________# sub getSubCatsinCategory { my $content_subcats = $_[0]; &logAction("Searching for subcats in this category"); # if its not a wikipedia category page, empty array unless ($content_subcats =~ m/<div id="mw-subcategories">/){ $content_subcats = ""; my @found_subcats = split(/\|/,$content_subcats); &logAction("Found 0 subcats in this category"); return (@found_subcats); } # empty array if no subcats, else populate with subcat names if ($content_subcats =~ m/There are 0 subcategories to this category/){ $content_subcats = ""; &logAction("Found 0 subcats in this category"); } else { $content_subcats =~ s/[\s\S]*<div id="mw-subcategories">//; $content_subcats =~ s/<div id="mw-pages">[\s\S]*//; $content_subcats =~ s/<h3>[\s\S]*?<\/h3>//g; $content_subcats =~ s/<div[\s\S]*?>//g; $content_subcats =~ s/<\/div>//g; $content_subcats =~ s/<span[\s\S]*?<\/span>//g; $content_subcats =~ s/[\s\S]*?<ul>/<ul>/; $content_subcats =~ s/<ul>//g; $content_subcats =~ s/<\/ul>//g; $content_subcats =~ s/<\/li>/|/g; $content_subcats =~ s/<li>/|/g; $content_subcats =~ s/<a[\s\S]*?>//g; $content_subcats =~ s/<\/a>//g; $content_subcats =~ s/\n//g; $content_subcats =~ s/\|\|/\|/g; $content_subcats =~ s/<td>//g; $content_subcats =~ s/<\/td>//g; $content_subcats =~ s/<\/tr>//g; $content_subcats =~ s/<\/table>//g; $content_subcats =~ s/[\s]*?\|/\|/g; $content_subcats =~ s/\|$//; $content_subcats =~ s/^\|//; $content_subcats =~ s/\|\|/\|/g; &logAction("Found 1 or more subcats in category $content_subcats"); } my @found_subcats = split(/\|/,$content_subcats); return (@found_subcats); } #______________________________________________________________________________# sub processContents { my $category = $_[0]; my $contents = $_[1]; $category =~ s/_/ /g; &logAction("Starting to process category $category"); #Seperate the page generation from spider work use threads; use Config; if ($Config{useithreads}) { # We have threads # Let user know spider is on the job. &logAction("Notifying user bot starting"); &printOnlineHeader; print "<h3><font face=\"arial\">PockBot is now running - DO NOT CLOSE THIS WINDOW</font></h3>"; print "<p><font face=\"arial\">Thank you for using PockBot. You wanted a list of article classes for "; print " wikipedia category <a href=\"http://en.wikipedia.org/wiki/Category:$category\">$category</a>.</font></p>"; print "<p><font face=\"arial\">The content will take some time to generate. When complete, the results will be posted to wikipedia for you.<br>"; print "In respect for wikipedia's servers, PockBot will only make one read request to wikipedia servers every second<br>"; print "PockBot can read 3600 pages an hour under ideal network conditions. Large categories may therefore take up to an hour to run</font></p>"; print "<p><font face=\"arial\"><b>Progress:</b><br>PockBot is running...<br></font></p>"; &printFooter; # Set spider to work on requested category, in separate thread my $threadForSpidering = threads->new(\&workthread, $category, $contents); #$threadForSpidering->detach; my @listOfAllArticlesFound = $threadForSpidering->join; } else { &error("PockBot requires threads. This perl installation is not built with threads activated. PockBot cannot run."); } } #______________________________________________________________________________# sub removeDuplicates { my @articles = @_; my @articles_no_duplicates = (); &logAction("Removing duplicates from found articles list."); foreach my $suggested_article (@articles) { my $already_exists = 0; foreach my $existing_article (@articles_no_duplicates) { if ($suggested_article eq $existing_article) { $already_exists = 1; } } if ($already_exists == 0) { push(@articles_no_duplicates, $suggested_article); } } return (@articles_no_duplicates); } #______________________________________________________________________________# sub getAllArticlesIn { my @subcats = @_; my @new_articles = (); foreach my $individual_subcat (@subcats) { &logAction("Searching for new articles in subcat $individual_subcat"); my ($subcategory, $subcategorycontents) = fetchContents($individual_subcat); my @found_articles = getArticlesinCategory($subcategorycontents); foreach my $found_article (@found_articles) { push(@new_articles, $found_article); } } return (@new_articles); } #______________________________________________________________________________# sub getArticleClasses { my @articles_no_duplicates = @_; my %classes = (); foreach my $article_title (@articles_no_duplicates) { my ($talkpage, $contents) = fetchTalkContents($article_title); my $class = "?? (unclassified)"; &logAction("Getting article class for article $article_title"); if ($contents =~ m/[\s\S]*class=Start[\s\S]*/) { $class = "Start"; } if ($contents =~ m/[\s\S]*class= Start[\s\S]*/) { $class = "Start"; } if ($contents =~ m/[\s\S]*class=Stub[\s\S]*/) { $class = "Stub"; } if ($contents =~ m/[\s\S]*class= Stub[\s\S]*/) { $class = "Stub"; } if ($contents =~ m/[\s\S]*class=A[\s\S]*/) { $class = "A"; } if ($contents =~ m/[\s\S]*class= A[\s\S]*/) { $class = "A"; } if ($contents =~ m/[\s\S]*class=B[\s\S]*/) { $class = "B"; } if ($contents =~ m/[\s\S]*class= B[\s\S]*/) { $class = "B"; } if ($contents =~ m/[\s\S]*class=FA[\s\S]*/) { $class = "FA"; } if ($contents =~ m/[\s\S]*class= FA[\s\S]*/) { $class = "FA"; } # add details of article class to hash $classes{'$article_title'} = $class; } return (%classes) } #______________________________________________________________________________# sub writeResultsToFile { my $category = $_[0]; my $text_to_print = $_[1]; my $results_file = '/files/home2/thepaty/cgi-bin/results.htm'; &logAction("Writing bot results to file."); open(RESULTSFILE,">$results_file") || &error("Cannot open bot results file."); flock(RESULTSFILE, 2) || &error("Cannot lock bot results file."); print RESULTSFILE "$text_to_print"; flock(RESULTSFILE, 8); close (RESULTSFILE); } #______________________________________________________________________________# sub getTimeStamp { my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat Sun); my ($second, $minute, $hour, $dayOfMonth, $month, $yearOffset, $dayOfWeek, $dayOfYear, $daylightSavings) = localtime(); my $year = 1900 + $yearOffset; my $timeNow = "$hour:$minute:$second, $weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year"; return ($timeNow); } #______________________________________________________________________________# sub logAction { my $actionToLog = $_[0]; my $log_file = '/files/home2/thepaty/cgi-bin/log.htm'; my $timeStamp = getTimeStamp(); open(LOGFILE,">>$log_file") || &error("Cannot open log file."); flock(LOGFILE, 2) || &error("Cannot lock log file."); print LOGFILE "$timeStamp: $actionToLog<br>"; flock(LOGFILE, 8); close (LOGFILE); } #______________________________________________________________________________# sub workthread { my $category = $_[0]; my $contents = $_[1]; &logAction("Starting work thread for category $category"); my @subcats = getSubCatsinCategory($contents); my @articles = getArticlesinCategory($contents); my $new_subcats_found_this_round = 1; my @subcats_searched_aleady = (); # Keep searching until no new subcats are found.in any categories searched while ($new_subcats_found_this_round > 0) { $new_subcats_found_this_round = 0; my @proposed_extra_subcats = (); # Perform a search of every category we currently know of foreach my $existing_subcat (@subcats) { my $already_searched = 0; # If already searched this category in an earlier pass, skip it. foreach my $searched_subcat (@subcats_searched_aleady) { if ($existing_subcat eq $searched_subcat) { $already_searched = 1; } } # If not already searched, get all subcats of that category if ($already_searched == 0) { &logAction("Have not searched subcat $existing_subcat already"); my ($subcategory, $subcategorycontents) = fetchContents($existing_subcat); my @additional_subcats = getSubCatsinCategory($subcategorycontents); foreach my $proposed_additional_subcat (@additional_subcats) { push(@proposed_extra_subcats, $proposed_additional_subcat); &logAction("Found possible new subcat $proposed_additional_subcat"); } push(@subcats_searched_aleady, $existing_subcat); } else { &logAction("Have searched subcat $existing_subcat already"); } } # If this new found subcat isn't a duplicate of one we already know about... foreach my $proposed_new_subcat (@proposed_extra_subcats) { my $already_exists = 0; foreach my $existing_subcat (@subcats) { if ($proposed_new_subcat eq $existing_subcat) { $already_exists = 1; } } # then add it to our master list if ($already_exists == 0) { &logAction("subcat $proposed_new_subcat is a genuinely new subcategory, adding to master list"); push(@subcats, $proposed_new_subcat); $new_subcats_found_this_round++; } else { &logAction("subcat $proposed_new_subcat already existed in master list, ignoring"); } } &logAction("$new_subcats_found_this_round new subcats found this round. If greater than zero, should run through again"); } # And now get a list of every article in every subcat my @new_articles = getAllArticlesIn(@subcats); my @articles = (@articles, @new_articles); # Remove duplicates from article list. my @articles_no_duplicates = removeDuplicates(@articles); # Search talk pages for each article to find "class=X" classification my %classes = getArticleClasses(@articles_no_duplicates); # Prepare text to print to results file my $text_to_print = "<p><font face=\"arial\">Pages in category <a href=\"http://en.wikipedia.org/wiki/Category:$category\">$category</a>"; $text_to_print .= " retrieved by <a href=\"http://en.wikipedia.org/wiki/User:PockBot\">PockBot</a>.<br><br>"; $text_to_print .= "<b>PockBot is currently In Development and the below does not represent final output.</b></font></p>"; $text_to_print .= "<table border=\"1\" cellpadding=\"2\"><tr>"; $text_to_print .= "<th style=\"background:#ffdead;\" width=\"350\">Article</th>"; $text_to_print .= "<th style=\"background:#ffdead;\" width=\"100\">Class / Status</th></tr>"; foreach my $article_title (@articles_no_duplicates) { $text_to_print .= "<tr>"; $text_to_print .= "<td><a href=\"http://en.wikipedia.org/wiki/$article_title\" title=\"$article_title\">"; $text_to_print .= "$article_title</a></td><td>$classes{'$article_title'}</td>"; $text_to_print .= "</tr>"; } $text_to_print .= "</table>"; # write results to results.htm &writeResultsToFile($text_to_print,$category); return (@articles_no_duplicates); } #______________________________________________________________________________# sub fetchContents { my $category = $_[0]; $category =~ s/\s/_/g; my $category_url = "http://en.wikipedia.org/wiki/Category:" . $category; &logAction("Fetching page contents for category $category"); my $browser = LWP::UserAgent->new(); $browser->timeout(60); my $request = HTTP::Request->new(GET => $category_url); my $response = $browser->request($request); if ($response->is_error()) {printf "%s\n", $response->status_line;} my $contents = $response->content(); sleep(1); # don't hammer the server! One read request every 1 second. return($category,$contents); } #______________________________________________________________________________# sub fetchTalkContents { my $category = $_[0]; $category =~ s/\s/_/g; my $category_url = "http://en.wikipedia.org/wiki/Talk:" . $category; &logAction("Fetching talk page contents for category $category"); my $browser = LWP::UserAgent->new(); $browser->timeout(60); my $request = HTTP::Request->new(GET => $category_url); my $response = $browser->request($request); if ($response->is_error()) {printf "%s\n", $response->status_line;} my $contents = $response->content(); sleep(1); # don't hammer the server! One read request every 1 second. return($category,$contents); } #______________________________________________________________________________# sub finishedRunning { my $category = $_[0]; my $category_url = "http://en.wikipedia.org/wiki/Talk:" . $category; &logAction("Finished processing category $category"); print "<p><font face=\"arial\">PockBot has finished running. The results should be visible on the talk page at <a href=\"$category_url\">Category_talk:$category</a></font></p>"; &printFooter; } #______________________________________________________________________________# sub resetLogAndResultsFiles { my $log_file = '/files/home2/thepaty/cgi-bin/log.htm'; my $results_file = '/files/home2/thepaty/cgi-bin/results.htm'; &logAction("Resetting log and results files to empty"); open(LOGFILE,">$log_file") || &error("Cannot open log file."); flock(LOGFILE, 2) || &error("Cannot lock log file."); print LOGFILE ""; flock(LOGFILE, 8); close (LOGFILE); open(RESULTSFILE,">$results_file") || &error("Cannot open log file."); flock(RESULTSFILE, 2) || &error("Cannot lock log file."); print RESULTSFILE ""; flock(RESULTSFILE, 8); close (RESULTSFILE); } #______________________________________________________________________________# sub getMainCategory{ my $category = "BLANK"; $category = param('category_specified'); &resetLogAndResultsFiles(); &logAction("Bot started for category $category"); if ($category eq "BLANK") { &error("Error receiving category name"); } else { my ($category, $contents) = fetchContents($category); &processContents($category,$contents); &finishedRunning($category); } } #______________________________________________________________________________# sub enableBot { my $status_file = "/files/home2/thepaty/cgi-bin/status.txt"; &logAction("Bot enable request made"); open(STATUSFILE,"$status_file") || &error("Cannot open bot status file."); flock(STATUSFILE, 2) || &error("Cannot lock bot status file."); my $current_status = <STATUSFILE>; flock(STATUSFILE, 8); close (STATUSFILE); chomp($current_status); my $bot_enabled = $current_status; if ($bot_enabled == 1) { &logAction("Bot already enabled, no action necesary"); &printOnlineHeader; print "<p><font face=\"arial\">PockBot is already enabled. <a href=\"http://ccgi.thepaty.plus.com/cgi-bin/PockBot.cgi?action=disableBot\">Disable PockBot</a></font></p>"; &printFooter; exit; } elsif ($bot_enabled == 0) { &logAction("Bot currently disabled. Enabling bot."); open(STATUSFILE,">$status_file") || &error("Cannot open bot status file."); flock(STATUSFILE, 2) || &error("Cannot lock bot status file."); print STATUSFILE "1"; flock(STATUSFILE, 8); close (STATUSFILE); &printOnlineHeader; print "<p><font face=\"arial\">PockBot is now enabled. <a href=\"http://ccgi.thepaty.plus.com/cgi-bin/PockBot.cgi?action=disableBot\">Disable Pockbot</a></font></p>"; &printFooter; exit; } else { &error("Unrecognised bot status. Something has gone wrong."); } } #______________________________________________________________________________# sub disableBot { my $status_file = "/files/home2/thepaty/cgi-bin/status.txt"; &logAction("Bot disable request made"); open(STATUSFILE,"$status_file") || &error("Cannot open bot status file."); flock(STATUSFILE, 2) || &error("Cannot lock bot status file."); my $current_status = <STATUSFILE>; flock(STATUSFILE, 8); close (STATUSFILE); chomp($current_status); my $bot_enabled = $current_status; if ($bot_enabled == 0) { &logAction("Bot is already disabled. No action necessary"); &printOfflineHeader; print "<p><font face=\"arial\">PockBot is already disabled. <a href=\"http://ccgi.thepaty.plus.com/cgi-bin/PockBot.cgi?action=enableBot\">Enable PockBot</a></font></p>"; &printFooter; exit; } elsif ($bot_enabled == 1) { &logAction("Bot is currently enabled. Disabling bot."); open(STATUSFILE,">$status_file") || &error("Cannot open bot status file."); flock(STATUSFILE, 2) || &error("Cannot lock bot status file."); print STATUSFILE "0"; flock(STATUSFILE, 8); close (STATUSFILE); &printOfflineHeader; print "<p><font face=\"arial\">PockBot is now disabled. <a href=\"http://ccgi.thepaty.plus.com/cgi-bin/PockBot.cgi?action=enableBot\">Enable Pockbot</a></font></p>"; &printFooter; exit; } else { &error("Unrecognised bot status. Something has gone wrong."); } } #______________________________________________________________________________# sub checkIfBotOnline { my $status_file = '/files/home2/thepaty/cgi-bin/status.txt'; &logAction("Checking if bot is online"); open(STATUSFILE,"$status_file") || &error("Cannot open bot status file."); flock(STATUSFILE, 2) || &error("Cannot lock bot status file."); my $current_status = <STATUSFILE>; flock(STATUSFILE, 8); close (STATUSFILE); chomp($current_status); my $bot_enabled = $current_status; if ($bot_enabled == 0) { &logAction("Bot is disabled, cannot perform action"); &printOfflineHeader; print "<p><font face=\"arial\">PockBot is currently disabled. If you are certain it has nt been disabled for a reason, you can <a href=\"http://ccgi.thepaty.plus.com/cgi-bin/PockBot.cgi?action=enableBot\">Enable PockBot</a></font></p>"; &printFooter; exit; } elsif ($bot_enabled == 1) { &logAction("Bot is enabled, we are good to go."); #no action necessary } else { &error("Unrecognised bot status. Something has gone wrong."); } } #______________________________________________________________________________# sub printOnlineHeader { print "Content-type: text/html\n\n"; print "<html><head><title>PockBot</title></head><body>"; print "<p><font face=\"arial\" size=\"1\"><a href=\"http://en.wikipedia.org/wiki/Main_Page\">Wikipedia</a> > <a href=\"http://en.wikipedia.org/wiki/User:PockBot\">Pockbot's User Page</a></font></p>"; print "<p><font face=\"arial\"><b>Pockbot is currently ONLINE / ENABLED</b> (<a href=\"http://ccgi.thepaty.plus.com/cgi-bin/PockBot.cgi?action=disableBot\">Disable PockBot</a>)</font><p>"; print "<img src=\"http://www.thepaty.plus.com/pockbot.gif\">"; } sub printOfflineHeader { print "Content-type: text/html\n\n"; print "<html><head><title>PockBot</title></head><body>"; print "<p><font face=\"arial\" size=\"1\"><a href=\"http://en.wikipedia.org/wiki/Main_Page\">Wikipedia</a> > <a href=\"http://en.wikipedia.org/wiki/User:PockBot\">Pockbot's User Page</a></font></p>"; print "<p><font face=\"arial\"><b>Pockbot is currently OFFLINE / DISABLED</b> (<a href=\"http://ccgi.thepaty.plus.com/cgi-bin/PockBot.cgi?action=enableBot\">Enable PockBot</a>)</font><p>"; print "<img src=\"http://www.thepaty.plus.com/pockbot.gif\">"; } #______________________________________________________________________________# sub printFooter { print "</body></html>"; } #______________________________________________________________________________# sub error { &checkIfBotOnline; &logAction("ERROR: $_[0]"); &printOnlineHeader; print "<p><font face=\"arial\">ERROR: $_[0]</font></p>"; &printFooter; exit; }