# User:FairuseBot/10c-removal.pl

#!/usr/bin/perl


# 10c-removal
#
# A bot to remove images that do not comply with NFCC #10c from pages

use strict;
use warnings;

use Date::Calc;
use Data::Dumper;

use libBot;

# Link targets that are ignored when looking at what an image description
# page links to: the main loop filters every title matching $common_links
# out of the image's link list before any compliance test is run.
my @common_links = ("Copyright", "Copyright infringement", "Fair use", "Logo", "Trademark", "United States copyright law", "Wikimedia",
                    "Computer game", "Counterfeit", "Currency", "Free software", "Portable Network Graphics", "Poster", "Public domain",
                    "Screenshot", "Station identification", "United States Code", "U.S. state", "Video game", "Wikimedia Foundation",
                    "Work of the United States Government");
# Regex alternation of the titles above.  Each title is passed through
# quotemeta so that it only matches itself: without it the dots in
# "U.S. state" would act as wildcards and match unrelated titles.
my $common_links = join "|", map { quotemeta($_) } @common_links;

# Run-time switches.
# $test: when true, the main loop works on a single dummy image instead of
# the log files and also queries namespace 2 in addition to namespace 0.
my $test = 0;

# Working directory holding the bot's log, cookie, PID and input files.
my $homedir = '/home/mark/Desktop/wikibots/10cbot';
# Allow talkpage messages to stop the bot?
my $permit_interruptions = 1;

# Bring up the bot library: account name, empty password argument, then the
# log file and the cookie file, both kept under $homedir.
Pearle::init("FairuseBot", "", $homedir . "/removebot.log", $homedir . "/removebot-cookies.txt");
Pearle::config(nullOK => 1, printlevel => 4);
config(username => "FairuseBot");

# Without a successful login there is nothing useful to do.
exit unless Pearle::login();

# Check for a running copy.
# The file "$homedir/pid" holds the process ID written by the most recent
# run.  If a process with that ID still exists and its `ps` line names this
# script, the previous run has not finished: warn and stop.  Otherwise the
# file is considered stale and is overwritten with our own PID.
if(-e "$homedir/pid")
{
	# Possible other copy.  Compare PIDs
	my $pid;
	if(open(my $pid_in, "<", "$homedir/pid"))
	{
		$pid = <$pid_in>;
		close $pid_in;
	}
	else
	{
		# An unreadable PID file is treated like a stale one, but say so.
		Pearle::myLog(0, "Unable to read PID file $homedir/pid: $!\n");
	}

	# Only a purely numeric PID is ever handed to the shell; an empty or
	# corrupted file must not end up inside the backtick command.
	if(defined($pid) and $pid =~ /^\s*(\d+)\s*$/)
	{
		my $previous_pid = $1;
		my $psresult = `ps -p $previous_pid`;
		if(defined($psresult) and $psresult =~ /10c-removal\.pl/)
		{
			botwarnlog("*Previous run is taking longer than normal\n");
			exit;
		}
	}
}

# Record our own PID for the next run to check.  Failing to write it is
# logged but does not stop the run.
if(open(my $pid_out, ">", "$homedir/pid"))
{
	print {$pid_out} $$;
	close $pid_out;
}
else
{
	Pearle::myLog(0, "Unable to write PID file $homedir/pid: $!\n");
}

my $total_images = 0;
my @logs;

# Process one set of images.
#
# The image titles come from the "partial_failures*txt" files in $homedir
# whose file-name date is more than five days old (or from a single dummy
# image when $test is set).  For every image the description page is checked
# against a series of increasingly lenient tests; pages that pass a test are
# dropped from the list, and the image is removed from whatever pages are
# left at the end, provided at least one use was compliant.
#
# Every log file that was read is recorded in @logs so that it can be
# deleted once the set is done.  If the server stops answering part-way
# through, @logs is emptied again so the unprocessed titles are not lost.
SET:
{
	my @images;
	my $images_removed = 0;
	my $server_failed = 0;	# Set when an API query returned nothing usable
	
	Pearle::myLog(2, "Beginning set at " . time() . "\n");

	# Get the log
	if($test)
	{
		@images = ('Image:Dummy316.png');
	}
	else
	{
		# Scan the directory for log files.  On failure leave the whole set;
		# "last SET" is used because there is no subroutine to return from.
		my $CURRENT_DIR;
		if(!opendir($CURRENT_DIR, $homedir))
		{
			print "Failed: $!\n";
			last SET;
		}
		my @files = readdir $CURRENT_DIR;
		if(!@files)
		{
			print "Failed: $!\n";
			closedir $CURRENT_DIR;
			last SET;
		}
		closedir $CURRENT_DIR;
		@files = grep {/^partial_failures.*txt$/} @files;
		foreach my $file (@files)
		{
			my ($year, $month, $day) = $file =~ /_(\d{4})-(\d{1,2})-(\d{1,2})/;
			# Delta_Days dies on a missing or impossible date, so files whose
			# name carries no valid date are skipped and left in place.
			if(!defined($year) or !Date::Calc::check_date($year, $month, $day))
			{
				Pearle::myLog(1, "Skipping log file $file: no valid date in its name\n");
				next;
			}
			if(Date::Calc::Delta_Days( $year, $month, $day, (Date::Calc::Today(1))) > 5)
			{
				my $infile;
				if(!open($infile, "<:utf8", "$homedir/$file"))
				{
					# Do not queue an unread file for deletion.
					Pearle::myLog(0, "Unable to read log file $homedir/$file: $!\n");
					next;
				}
				my @new_images = <$infile>;
				close $infile;
				chomp @new_images;
				push @images, @new_images;
				
				push @logs, "$homedir/$file";
			}
		}
	}
		
	Pearle::myLog(3, join("\n", @images));
	Pearle::myLog(3, "\n" . scalar(@images) . " images found\n");
	
	if(scalar(@images) == 0)
	{
		Pearle::myLog(1, "*No images to remove\n");
	}

	# Namespaces used for both the link list and the image-usage list:
	# namespace 0 normally, namespaces 0 and 2 when testing.
	my @namespaces = $test ? (0, 2) : (0);

	IMAGE:
	foreach my $image (@images)
	{
		my $removal_prefix = "Image with inadequate rationale removed:";
		my $removal_comment = "Removing image with inadequate [[WP:NFCC|rationale]]";
		
		# Fetch image info
		Pearle::myLog(2, "Processing image $image\n");
		# Fetch the image data
		my $image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], 
							plnamespace => [@namespaces],						# Links
							rvprop => ['content'],							# Article body
							iiprop => ['user', 'comment', 'sha1'], iilimit => 500,			# Upload history
							meta => 'userinfo', uiprop => ['hasmsg'], 					# Check for talkpage messages
							list => 'imageusage', iutitle => $image, iunamespace => [@namespaces], iulimit => 500);	# Image usage
		
		if(!defined($image_data))
		{
			Pearle::myLog(0, "Server did not return an appropriate response.  Exiting.\n");
			$server_failed = 1;
			last IMAGE;
		}
	
		# Extract the list of pages where it's used.
		my @pages = GetPageList($image_data);
		my $num_pages = scalar(@pages);
		my @failed_pages;
		# Extract the categories
		my @categories = GetPageCategories($image_data);
		# Extract a list of pages this image links to.
		my @links = GetPageLinks($image_data);
		# Filter out common links
		@links = grep {$_ !~ /^($common_links)$/} @links;

		if($permit_interruptions and DoIHaveMessages($image_data))
		{
			Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
			exit;
		}
		
		# Sanity check: Does the image still exist?
		if($image_data =~ /missing=""/)
		{
			Pearle::myLog(2, "*Image [[:$image]] has been deleted.\n");
			next;
		}
		# Sanity check: Is this still tagged as non-free?
		if(!grep {$_ eq 'Category:All non-free media'} @categories)
		{
			Pearle::myLog(2, "*Image [[:$image]] is no longer marked as non-free.\n");
			next;
		}
		# Sanity check: Is the image used?
		if(scalar(@pages) == 0)
		{
			# Orphaned fairuse image
			Pearle::myLog(2, "*Image [[:$image]] is not used anywhere\n");
			# Is this image already disputed?
			if(grep {$_ eq 'Category:All disputed non-free images'} @categories)
			{
				Pearle::myLog(2, "*Image [[:$image]] is already marked for deletion.\n");
			}
			else
			{
				if(!grep {$_ eq 'Category:All orphaned fairuse images'} @categories)
				{
					my $text = "\n{{subst:orfud}}\n";
					wikilog($image, $text, "Non-free image is not used in any article\n");
				}
			}
			next;
		}
		# Sanity check: Is the image still tagged as disputed?
		if(!grep {$_ eq 'Category:All disputed non-free images'} @categories)
		{
			Pearle::myLog(2, "*Image [[:$image]] is not marked for deletion.\n");
			next;
		}
		
		# Remove the NFCC-failure tag and the list of pages
		# Blindly removing the tag is safe:
		# 1) If the program fails, 10cbot will pick the image up on its next pass
		# 2) If the image is orphaned, or will be orphaned by removal (unlikely), 10cbot or another bot will pick it up
		# 3) If the image is non-compliant on all pages, 10cbot will pick it up on the next pass
		my $wikipage = Pearle::getPage($image);
		my $text = $wikipage->getEditableText();
		# NOTE(review): \x03 and \x04 presumably stand for the template
		# delimiters in the editable text -- confirm against libBot.
		$text =~ s/\x03\x44i-missing article links[^\x04]*\x04//s;
		Pearle::myLog(4, "Text after processing:\n$text\n");
		$wikipage->setEditableText($text);
		Pearle::postPage($wikipage, "Removing tag", 0);
		Pearle::limit();
		
		# Build the image-matching regex
		my ($raw_image) = $image =~ /Image:(.*)/;
		# Sanity check: the title must have parsed before it is used any further
		if(!defined($raw_image))
		{
			botwarnlog("*Parse error on image [[:$image]] ()\n");
			next;
		}
		$raw_image = MakeWikiRegex($raw_image);
		if($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg)$/i)
		{
			Pearle::myLog(2, "*Non-image media file [[:$image]] found.\n");
			next;			# Non-image media are too hard to work with
		}
				
		# Sanity check: the generated pattern must match the title it came from
		if(!defined($raw_image) or $image !~ /$raw_image/)
		{
			botwarnlog("*Parse error on image [[:$image]] (" . (defined($raw_image) ? $raw_image : "") . ")\n");
			next;
		}
		my $image_regex = "[ _]*[Ii][Mm][Aa][Gg][Ee][ _]*:[ _]*${raw_image}[ _]*";
		Pearle::myLog(3, "Image regex: $image_regex\n");
		
		# Check for best-case compliance: each use has a matching direct link in the body of the text - tested
		Pearle::myLog(4, "Image is used in " . scalar(@pages) . " pages.\n");
		Pearle::myLog(4, "Image is used on " . join("|", @pages) . "\n");
		Pearle::myLog(4, "Image links to " . join("|", @links) . "\n");
		
		foreach my $link (@links)	# Filter out pages that match a link
		{
			@pages = grep {$_ ne $link} @pages;
		}
		Pearle::myLog(4, "Image failed best-case test for " . scalar(@pages) . " pages.\n");
		next if(scalar(@pages) == 0);
			
		# Check for liberal compliance:
		# For each use, remove it from the list if there's a case-insensitive match in the body text - tested
		foreach my $page (@pages)
		{
			my $page_match_regex = MakeWikiRegex($page);
			push @failed_pages, $page unless($text =~ /$page_match_regex/i);
		}
		@pages = @failed_pages;
		@failed_pages = ();
		
		Pearle::myLog(4, "Image failed text test for " . scalar(@pages) . " pages.\n");
		next if(scalar(@pages) == 0);
		
		# Check for strict compliance:
		# For each link, chase redirects - tested
		if(scalar(@links) > 0)
		{
			my $page_data = Pearle::APIQuery(titles => \@links, redirects => 1);
			if(!defined($page_data))
			{
				# Without the redirect list, compliant uses could be mistaken
				# for failures; stop instead, as for the first query above.
				Pearle::myLog(0, "Server did not return an appropriate response.  Exiting.\n");
				$server_failed = 1;
				last IMAGE;
			}
			my $parsed_xml = Pearle::getXMLParser()->XMLin($page_data);
			my @redirects;
			Pearle::myLog(4, Dumper($parsed_xml));
			if(exists($parsed_xml->{query}->{redirects}->{r}) and defined($parsed_xml->{query}->{redirects}->{r}))
			{
				# A single redirect is parsed as one hash, several as an array of hashes
				if(ref($parsed_xml->{query}->{redirects}->{r}) eq 'ARRAY')
				{
					@redirects = @{$parsed_xml->{query}->{redirects}->{r}};
				}
				else
				{
					@redirects = ($parsed_xml->{query}->{redirects}->{r});
				}
			}
			foreach my $page (@pages)
			{
				my $matched = 0;
				foreach my $redirect (@redirects)
				{
					if($redirect->{to} eq $page)
					{
						# We can get there by a redirect
						UpdateLink($image, $redirect->{from}, $page);
						Pearle::limit();
						$matched = 1;
						last;
					}
				}
				if(!$matched)
				{
					push @failed_pages, $page;
				}
			}
			@pages = @failed_pages;
			@failed_pages = ();
		}
		
		Pearle::myLog(4, "Image failed redirect test for " . scalar(@pages) . " pages.\n");
		next if(scalar(@pages) == 0);
	
		# Check for near-compliance:
		# For each use, if we can get to it by means of a disambiguation page, update the link - tested
		foreach my $link (@links)
		{
			# Fetch the page text and page links
			my $page_data = Pearle::APIQuery(titles => [$link], prop => ['links', 'revisions'], 
						plnamespace => [2],  						# Links
						rvprop => ['content']);					# Article body
			if(!defined($page_data))
			{
				Pearle::myLog(0, "Server did not return an appropriate response.  Exiting.\n");
				$server_failed = 1;
				last IMAGE;
			}
			# If the page text indicates disambig, see if any of the links is one we're looking for
			my $page_text = GetPageText($page_data);
			if(defined($page_text) and $page_text =~ /\{\{disambig\}\}/i)
			{
				my @page_links = GetPageLinks($page_data);
				foreach my $disambig_link (@page_links)
				{
					if(grep {$_ eq $disambig_link} @pages)
					{
						# It's a match.  Remove it from the list
						@pages = grep {$_ ne $disambig_link} @pages;
						# Post to the page
						my $success = UpdateLink($image, $link, $disambig_link);
						if(!$success)
						{
							botwarnlog("*Failed to update disambiguation link for [[:$image]] from [[$link]] to [[$disambig_link]]\n");
						}
						Pearle::limit();
					}
				}
			}
		}
		
		Pearle::myLog(4, "Image failed disambiguation test for " . scalar(@pages) . " pages.\n");
		next if(scalar(@pages) == 0);
		
		# Test for compliance
		# Over-use (some compliant, some non-compliant): Remove from any non-compliant articles, OrphanBot-style.  Leave a note on the article talk page.
		# (@pages cannot be empty here because of the "next" just above, so
		# the final else branch is kept only as a safeguard.)
		if(scalar(@pages) > 0 and $num_pages > scalar(@pages))
		{
			Pearle::myLog(2, "Image $image failed on " . scalar(@pages) . " pages.\n");
			
			my $parsed_removal_comment = $removal_comment;
			$parsed_removal_comment =~ s/image/[[:$image|image]]/;
			foreach my $page (@pages)
			{
				notelog("Page for removal: $page\n");
				my $hits = RemoveImageFromPage($image, $page, $image_regex, $removal_prefix, $parsed_removal_comment);
				if($hits) 	# Only call Pearle::limit() when something was actually removed
				{
					Pearle::myLog(2, "Removed image $image from article $page ($hits times)\n");
					Pearle::limit();
					$images_removed += $hits;
				}
			}
		}
		elsif(scalar(@pages) > 0)
		{
			# Fully-non-compliant.  Should never occur, but if it does, let 10cbot pick it up on the next pass.
			Pearle::myLog(2, "Image $image failed on all pages\n");
		}
		else
		{
			Pearle::myLog(2, "Image $image is now fully-compliant\n");
		}
	}

	# The set was cut short: keep the log files so that the titles that were
	# never reached are read again on the next run.  Titles already handled
	# are stopped by the sanity checks above when they are seen again.
	if($server_failed)
	{
		@logs = ();
	}

	Pearle::myLog(2, "Finished with set.  Removed $images_removed images.\n");
	$total_images += $images_removed;
}

# Clean up after the run: first delete every log file listed in @logs ...
unlink(@logs);

# ... then delete the PID file in $homedir.
unlink($homedir . "/pid");