#!/usr/bin/perl
#
# LinkParser worker.
#
# Connects to a server on 127.0.0.1:<port> (port is the first command-line
# argument) and speaks a simple line protocol with it:
#
#   worker -> server : REQUEST
#   server -> worker : EDIT [[page]] [[lang:User:name]] http://diff-url size
#                      NOEDIT
#   worker -> server : PARSED [[page]] diff-url size [[lang:User:name]] |links|
#
# For every EDIT the worker fetches the diff (or the page text when the URL
# is not an index.php URL), extracts the http:// links that were added and
# not also removed, and reports them with a PARSED line.  After more than
# $MAX_EDITS edits the worker shuts its connection down and exits.

# Detach: the parent returns immediately, the child carries on.
fork and exit;

use POE qw (Component::Client::TCP);
use HTML::Entities;
use LWP::UserAgent;
use perlwikipedia;
use strict;
use warnings;

my $editor      = Perlwikipedia->new( "LinkParser", "LinkParser" );
my $diffFetcher = LWP::UserAgent->new;
$diffFetcher->agent("LinkParser/2.0");

my %settings;
$settings{'debug'} = 0;

# Read "key=value" settings; lines starting with '#' are comments.
print "Reading config file...\n" if $settings{'debug'};
open( my $config_fh, '<', 'linkwatcher-config' )
    or die "Can't open LinkWatcher config: $!";
while ( my $line = <$config_fh> ) {
    next if $line =~ /^#/;
    if ( $line =~ /(.+?)=(.+)/ ) {
        $settings{$1} = $2;
    }
}
close($config_fh);

# 'prefixes' is a '|'-separated list; tolerate it being absent from the config.
my @prefixes =
    split( /\|/, defined $settings{'prefixes'} ? $settings{'prefixes'} : '' );
print "Prefixes: " . join( " - ", @prefixes ) . "\n" if $settings{'debug'};
print "done\n" if $settings{'debug'};

my $server_port = shift;
defined $server_port or die "Usage: $0 <server-port>\n";

# NOTE(review): presumably gives the server time to start listening - confirm.
sleep 4;

POE::Component::Client::TCP->new(
    RemoteAddress => '127.0.0.1',
    RemotePort    => $server_port,
    ServerInput   => \&server_input,
    Connected     => \&connected,
);

# File-level handles to the POE kernel and session heap; they are filled in
# by connected() / request_edit().
my ( $heap, $kernel );

# Number of EDIT lines handled so far, and the limit after which we stop.
my $number_of_edits = 0;
my $MAX_EDITS       = 50;

POE::Kernel->run();
exit 0;

# ServerInput handler: called by POE with one line received from the server.
#
# EDIT lines are parsed, the added links are reported with a PARSED line, and
# a new REQUEST is sent.  NOEDIT lines cause a one second pause followed by a
# new REQUEST.  Any other input is ignored.  Once more than $MAX_EDITS edits
# have been handled, no further REQUEST is sent and the connection is shut
# down through POE so that output already queued (the last PARSED line) is
# not thrown away by exiting from inside the handler.
sub server_input {
    my ( $heap, $kernel, $input ) = @_[ HEAP, KERNEL, ARG0 ];

    my $wants_more = 0;

    if ( $input =~ m{EDIT \[\[(.+)\]\] \[\[(.+):User:(.+?)\]\] (http:\/\/.+) (.+)} ) {
        my ( $pagename, $lang, $username, $diffurl, $size ) =
            ( $1, $2, $3, $4, $5 );
        $number_of_edits++;

        my ( $added_text, $removed_text ) =
            _fetch_changed_text( $pagename, $diffurl );

        my @added_links   = _extract_links($added_text);
        my @removed_links = _extract_links($removed_text);

        my @really_added_links = ();
        if (@added_links) {
            if (@removed_links) {
                print "----\nDIFF $diffurl "
                    . join( " ", @added_links ) . " - "
                    . join( " ", @removed_links )
                    . "\n----\n"
                    if $settings{'debug'};
            }

            # A link that shows up on both sides of the diff was only moved
            # or re-wrapped, not added.
            @really_added_links =
                _links_not_in( \@added_links, \@removed_links );
            print "DIFF $diffurl " . join( " ", @really_added_links ) . "\n----\n"
                if $settings{'debug'};
        }

        if (@really_added_links) {
            my $message =
                  "PARSED [[$pagename]] $diffurl $size [[${lang}:User:$username]] |"
                . join( " ", @really_added_links ) . "|";
            $heap->{server}->put($message);
        }
        $wants_more = 1;
    }
    elsif ( $input =~ m{NOEDIT} ) {

        # Nothing to do right now; wait a moment before asking again.  This
        # blocks the event loop for that second.
        sleep 1;
        $wants_more = 1;
    }

    if ( $number_of_edits > $MAX_EDITS ) {

        # Let the TCP component close the connection; POE::Kernel->run()
        # then returns and the script exits at top level.
        $kernel->yield("shutdown");
        return;
    }

    $heap->{server}->put("REQUEST") if $wants_more;
    return;
}

# Connected handler: remembers the kernel and heap in the file-level
# variables and asks the server for the first edit.
sub connected {
    ( $kernel, $heap ) = @_[ KERNEL, HEAP ];
    $heap->{server}->put("REQUEST");
}

# Event-handler-style sub that asks the server for another edit.  It is not
# registered with the TCP component anywhere in this file.
sub request_edit {
    ( $kernel, $heap ) = @_[ KERNEL, HEAP ];
    $heap->{server}->put("REQUEST");
}

# Returns ( $added_text, $removed_text ) for an edit, lower-cased and with
# HTML entities decoded.  For index.php URLs the rendered diff is fetched and
# split into its added and deleted cells; otherwise the whole current page
# text counts as "added" and nothing as "removed".
sub _fetch_changed_text {
    my ( $pagename, $diffurl ) = @_;

    my $added   = "";
    my $removed = "";

    if ( $diffurl =~ m/index\.php/ ) {
        my $diff_html =
            $diffFetcher->get("$diffurl&diffonly=1&action=render")->content;
        print "$diff_html\n" if $settings{'debug'};

        my @added_cells = $diff_html
            =~ m/<td class=.diff-addedline.><div>(.*?)<\/div><\/td>/sg;
        my @removed_cells = $diff_html
            =~ m/<td class=.diff-deletedline.><div>(.*?)<\/div><\/td>/sg;

        $added   = lc( _strip_diff_markup( join( ' ', @added_cells ) ) );
        $removed = lc( _strip_diff_markup( join( ' ', @removed_cells ) ) );
        print "Added data: $added\n" if $settings{'debug'};
    }
    else {
        my $page_text = $editor->get_text($pagename);
        $added = lc( defined $page_text ? $page_text : "" );
    }

    decode_entities($added);
    decode_entities($removed);
    return ( $added, $removed );
}

# Removes the inline change-highlighting tags from diff cell text.  <span>,
# <ins> and <del> wrappers are all stripped from whichever side is passed in:
# the link pattern stops at '<', so a leftover tag inside a URL would
# truncate it and keep it from matching its counterpart on the other side.
sub _strip_diff_markup {
    my ($text) = @_;
    for my $tag (qw(span ins del)) {
        $text =~ s/<$tag class=.diffchange diffchange-inline.>//g;
        $text =~ s/<$tag class=.diffchange.>//g;
        $text =~ s/<\/$tag>//gi;
    }
    return $text;
}

# Returns every http:// link found in $text, in order of appearance.
sub _extract_links {
    my ($text) = @_;
    my @links = $text =~ m{(http://[^\s\]\[\{\}\\\|^~`<>]+)}sgi;
    return @links;
}

# Returns the elements of @$candidates that do not occur in @$excluded,
# keeping their original order and any duplicates.
sub _links_not_in {
    my ( $candidates, $excluded ) = @_;
    my %is_excluded = map { $_ => 1 } @{$excluded};
    return grep { !$is_excluded{$_} } @{$candidates};
}