User:AnomieBOT/source/d/Sections.pm

See /doc for formatted documentation
package d::Sections;

use utf8;
use strict;
use Digest::SHA qw/sha256_base64/;
use AnomieBOT::API;
AnomieBOT::API::load('d::Templates');
AnomieBOT::API::load('d::Redirects');
AnomieBOT::API::load('d::IWNS');

use vars qw/@ISA/;
@ISA=qw/d::Templates d::Redirects d::IWNS/;

=pod

=head1 NAME

d::Sections - AnomieBOT section manipulation decorator

=head1 SYNOPSIS

 use AnomieBOT::API;

 $api = new AnomieBOT::API('conf.ini', 1);
 $api->decorators(qw/d::Sections/);

=head1 DESCRIPTION

C<d::Sections> contains functions to manage sections in wikitext for use by an
AnomieBOT task. When "d::Sections" is used as a decorator on the API object,
the following methods are available.

Also, all the methods provided by A<d::Templates>, A<d::Redirects>, and
A<d::IWNS> are available, as they are used internally.

=head1 METHODS PROVIDED

=over

=item $api->split_sections( $wikitext )

=item $api->split_sections( $wikitext, $levels )

Splits the wikitext into an array of section object. If C<$levels> is provided,
it should contain the digits 1-6 representing the heading levels to consider.
For example, "24" would split on "==" and "====" sections, but ignore "===" and
all others.

A section object has the following properties:

=over

=item level

The level of the section, i.e. a number 1-6. If there is any text before the
first section heading, that section will have undef here.

=item title

The title of the section. If there is any text before the first section
heading, that section will have an empty string here.

=item titlespaced

A boolean, indicating whether there are spaces between the =s and the title.

=item titlecomment

If there is a comment after the title, it is contained here.

=item body

The body text of the section. Depending on the C<$levels> used, this may
contain other section headers.

=back

=item $api->join_sections( @sections )

Joins an array of section objects back together.

=cut

sub split_sections {
    my $api=shift;
    my $text=shift;
    my $levels=shift || '123456';
    $levels="$levels";

    my ($outtxt,$nowiki)=$api->strip_nowiki($text);
    my $comments=[];
    while(my ($k,$v)=each(%$nowiki)){
        push @$comments, $k if $v=~/^<!--/;
    }
    $comments=join("|", @$comments);

    my $eq_re=[];
    for(my $i=1; $i<=6; $i++){
        push @$eq_re, '='x$i if index($levels, "$i")>=0;
    }
    $eq_re='(?:'.join('|', @$eq_re).')';

    my @sections=();
    my @split=("", undef, "", split /(?:^|(?<=\n))($eq_re)([^=\n](?:.*[^=\n])?)\1((?:[ \t]*$comments)*[ \t]*)(?:\n|$)/, $outtxt);
    for(my $i=0; $i<@split; $i+=4){
        my $level=length($split[$i+0]);
        $level=undef if $level==0;
        my $t=$api->replace_nowiki($split[$i+1] // '', $nowiki);
        my $title=$t;
        $title=~s/^\s+|\s+$//g;
        my $comment=$api->replace_nowiki($split[$i+2] // '', $nowiki);
        $comment=~s/^\s+|\s+$//g;
        my $body=$api->replace_nowiki($split[$i+3] // '', $nowiki);
        next if(!defined($level) && $body eq '');
        push @sections, {
            title => $title,
            level => $level,
            titlespaced => ($t ne $title),
            titlecomment => $comment,
            body => $body,
        };
    }
    return @sections;
}

sub join_sections {
    my $api=shift;

    my $out='';
    foreach (@_){
        $out.="\n" if($out && $out!~/\n\n$/);
        if(defined($_->{'level'})){
            $out.='='x$_->{'level'};
            $out.=' ' if $_->{'titlespaced'}//0;
            $out.=$_->{'title'};
            $out.=' ' if $_->{'titlespaced'}//0;
            $out.='='x$_->{'level'};
            $out.=$_->{'titlecomment'}//'';
            $out.="\n";
        }
        $out.=$_->{'body'};
        $out.="\n" unless $out=~/\n$/;
    }
    return $out;
}

=pod

=item $api->extract_end_content( $text )

Extracts the "end content" from the passed article text. This consists of the
lines containing only whitespace, categories, interlanguage links, comments,
and templates (with some exceptions) from the end of the article. In a list
context, returns a 2-element list consisting of the portion of C<$text> that is
not end content and the portion that is; in a scalar context, returns just the
portion that is.

The list of exceptions can be set using C<< $api->set_non_end_templates() >>.

=item $api->non_end_templates

=item $api->non_end_templates( @templates )

Get or set the list of non-end templates.

=cut

my @default_non_end_templates=(
    'Template:Reflist',
    'Template:Refend',
    'Template:MediaWiki',
    'Template:Meta',
    'Template:Wikibooks',
    'Template:Cookbook',
    'Template:Wikibookspar',
    'Template:Commons',
    'Template:Commons cat',
    'Template:Commonsimages',
    'Template:Commonsimages cat',
    'Template:Wikinews',
    'Template:Wikinewscat',
    'Template:Wikinewsportal',
    'Template:Wikinewspar2',
    'Template:Wikinewshas',
    'Template:Wikiquote',
    'Template:Wikisource',
    'Template:Wikisourcelang',
    'Template:Wikisourcelangold',
    'Template:Wikisource author',
    'Template:Wikisource index',
    'Template:Wikisourcecat',
    'Template:Wikisource1911Enc',
    'Template:Wikisource1913CatholicEnc',
    'Template:WikisourceEBD1897',
    'Template:Wikisource1914NSRW',
    'Template:Wikispecies',
    'Template:Wikiversity',
    'Template:Wikiversity2',
    'Template:Wikiversity3',
    'Template:Wikiversity-bc',
    'Template:Wikiversity-r',
    'Template:WVS',
    'Template:WVD',
    'Template:Wiktionary',
    'Template:Wiktionarypar',
    'Template:Wiktionary-inline',
    'Template:Sisterlinks',
    'Template:Sisterlinkqsc',
);

sub non_end_templates {
    my $api=shift;
    my @old=@{$api->{'$d::Sections::non_end_templates'}};
    $api->{'$d::Sections::non_end_templates'}=[@_] if @_;
    return @old;
}

sub extract_end_content {
    my $api=shift;
    my $pre=shift;

    return undef unless $api->load_IWNS_maps();
    my %il=$api->interlanguage_map();
    my %ns=$api->namespace_reverse_map(1);
    $api->{'$d::Sections::non_end_templates'}=\@default_non_end_templates unless exists($api->{'$d::Sections::non_end_templates'});

    # First pass, load the list of templates used in the page.
    my %templates=();
    $pre=$api->process_templates($pre, sub {
        my $name=shift;
        $templates{"Template:$name"}=1;
        return undef;
    });

    # Next, find the redirect targets for each.
    my %skip=();
    my %map=$api->resolve_redirects(keys %templates);
    while(my ($k,$v)=each %map){
        $skip{$k}=1 if grep($_ eq $v, @{$api->{'$d::Sections::non_end_templates'}});
    }

    my $outtmpl={};
    $pre=$api->process_templates($pre, sub {
        my ($name, $params, $wikitext) = @_;
        return undef if $name=~/^#tag:\s*ref$/is;
        return undef if exists($skip{"Template:$name"});

        $wikitext=_end_content_unstrip_templates($wikitext,$outtmpl);
        my $tmp = $wikitext;
        utf8::encode( $tmp ) if utf8::is_utf8( $tmp );
        my $tag="\x02".sha256_base64($wikitext)."\x03";
        $tag=~tr!+/=!-_!d;
        $outtmpl->{$tag}={ name=>$name, text=>$wikitext };
        return $tag;
    });

    my $comment='<!--(?s:[^-]|-(?!->))*-->';

    my $catlang='\[\[\s*(?i:'.join('|', map("\Q$_\E", keys(%il), @{$ns{14}})).')\s*:(?:[^]]|\][^]])*\]\]';
    my $template='\x02(?:'.join('|', map { substr($_,1,-1) } keys %$outtmpl).')\x03';
    $pre=~s/(?:^|(?<=\n))((?:(?:$comment|$catlang|$template)\s*)*)$//;
    my $post=$1 // '';

    $pre=_end_content_unstrip_templates($pre, $outtmpl);
    $post=_end_content_unstrip_templates($post, $outtmpl);
    return wantarray?($pre, $post):$post;
}

sub _end_content_unstrip_templates {
    my $wikitext=shift;
    my $templ=shift;

    $wikitext=~s!(\x02[a-zA-Z0-9_-]+\x03)! exists($templ->{$1})?$templ->{$1}{'text'}:$1 !gioe;
    return $wikitext;
}

1;

=pod

=back

=head1 COPYRIGHT

Copyright 2008–2013 Anomie

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.