Wikipedia:Scripts/Perl scripts/hdump-head.pl

Script reads page metadata to attempt to generate a cite journal or PubMed search. Intended to be eventually integrated with the Template Filler, the example finds both PMID and DOI.

$ perl hdump-head.pl http://www.jlr.org/content/48/9/1936.long
#    http://www.jlr.org/content/48/9/1936.long
http://www.jlr.org/content/48/9/1936.long
<title> @0.0.1
  "Novel synthetic ceramide derivatives increase intracellular calci..."
citation_lastpage	1943
dc.language	en
citation_volume	48
citation_journal_abbrev	J. Lipid Res.
...
dc.publisher	American Society for Biochemistry and Molecular Biology
citation_abstract_html_url	http://www.jlr.org/content/48/9/1936.abstract
citation_pdf_url	http://www.jlr.org/content/48/9/1936.full.pdf
citation_issn	0022-2275
citation_issn	1539-7262
dc.date	2007-09-01
citation_section	Research Article
citation_id_from_sass_path	48/9/1936
robots	noarchive,nofollow

|author1=Yoo Bin Kwon |author2=Chang Deok Kim |author3=Jong-Kyung Youm |author4=Hyung Sub Gwak |author5=Byeong Deog Park \
|author6=Seung Hun Lee |author7=Saewha Jeon |date=09/01/2007 \
|title=Novel synthetic ceramide derivatives increase intracellular calcium levels and promote epidermal keratinocyte differentiation \
|journal=J. Lipid Res. |volume=48 |issue=9 |pages=1936–1943 |pmid=17554144 |doi=10.1194/jlr.M700185-JLR200 \
|url=http://www.jlr.org/content/48/9/1936.full 

09/01/2007[dp] \
Novel synthetic ceramide derivatives increase intracellular calcium levels and promote epidermal keratinocyte differentiation[ti] \
 J. Lipid Res.[ta] 48[vi] 9[ip] 1936–1943[pg] 
# Test driver for reading meta data tags from files or URIs given on the command
# line.
# Tries to match cite journal tags with citation_.* , Dublin Core (dc.*) and
# PRISM (prism.*) meta tags that could provide PMID, DOI, URL values or provide
# values for a CrossRef lookup
# Where no cite journal tag candidates are found, fall back to looking for meta
# tags that provide cite web data.
#
# http://prismstandard.org/namespaces/1.2/basic/
# Publishing Requirements for Industry Standard Metadata
# The PRISM Working Group is open to all IDEAlliance members and includes:
#    Adobe Systems, Inc.,  CMP Media, LLC, 
#    Hachette Filipacchi Media U.S. (HFM US),
#    Hearst Magazines, L A Burman Associates Inc.,
#    LexisNexisTM, a division of Reed Elsevier plc.,
#    The McGraw-Hill Companies, Inc., Reader's Digest, Taxonomy Strategies,
#    The Nature Group, Time Inc. and U.S.News & World Report.
#
# This program is free software;
# you can redistribute it and/or modify it under the same terms as Perl itself.
#
#use strict;
use URI;
use URI::Fetch;
use WWW::Mechanize;
use URI::Split qw(uri_split uri_join);
use HTML::Element;
use HTML::TreeBuilder;

sub abbreviate_journal($$)
{
    my($jname, $jissn) = @_;
    if ($jname eq 'Proceedings of the National Academy of Sciences of the United States of America') {
        $jname = 'Proc. Natl. Acad. Sci. U.S.A.';
    }
    elsif ($jname eq 'JNCI Journal of the National Cancer Institute') {
        $jname = 'J Natl Cancer Inst.';
    }
    return $jname;
}

sub splice_in_omim_entries($$)
{
    my($split_arg, $omim_str) = @_;
    my(%omim_nos);

    foreach $omim_str (split($split_arg, $omim_str)) {
	$omim_nos{$omim_str} = $omim_str;
    }
    my(@omim_urls);
    foreach $omim_str (keys(%omim_nos)) {
	#push(@omim_urls, 'http://omim.org/entry/' . $omim_str);
	push(@ARGV, 'http://omim.org/entry/' . $omim_str);
    }
    #print join("\n", @omim_urls);
    #print "\n";
}

my @cj_tag_order =      # Provide order for cite journal output
    qw( author author1 author2 author3 author4 author5 author6 author7
    date month year chapter chapterurl title journal work publisher
    volume issue pages pmid arxiv id isbn doi url);
my @cj_tags = (         # cite journal tag, meta tag list
[qw( author dc.creator DC.creator DC.contributor dc.contributor citation_authors citation_author )],
[qw( title citation_conference citation_title dc.title )],
[qw( chapter citation_title dc.title DC.title )],
[qw( issn citation_issn prism.issn )],
[qw( journal citation_journal_abbrev citation_journal_title prism.publicationname )],
[qw( volume citation_volume prism.volume )],
[qw( issue citation_issue prism.number )],
[qw( pages prism.endingpage citation_lastpage citation_firstpage prism.startingpage )],
[qw( date citation_date prism.publicationdate dc.date DC.date citation_year )],
[qw( pmid citation_pmid )],
[qw( arxiv citation_arxiv_id )],
[qw( url citation_fulltext_html_url citation_pdf_url citation_abstract_html_url )],
[qw( doi citation_doi dc.identifier DC.identifier )],
[qw( language dc.language DC.language )],
[qw( publisher dc.publisher DC.publisher dc.publisher.corporatename citation_publisher )],
[qw( isbn citation_isbn )],
);
fl_or_URI: for my $file_or_URI (@ARGV) {
    my %cite_values;
    my $tree = HTML::TreeBuilder->new;
    my $original_URI;
    print "#    $file_or_URI\n";
    my $head;
    if (-f $file_or_URI) {
        $tree->parse_file($file_or_URI);
        $head = $tree->look_down('_tag' , 'head' );
    }
    else {
        my($uri) = URI->new($file_or_URI);
        my($sch, $aut, $pth, $qry, $frg) = uri_split($uri);
        #print $sch, " ", $aut, " ", $pth, "\n";
        if (defined $sch && $sch eq 'http' && defined $aut && defined $pth) {
            if (substr($pth, -4) eq '.pdf') {
                $original_URI = $file_or_URI;
                next fl_or_URI if defined $qry || defined $frg;
                if (substr($pth, 0, 13) eq '/cgi/reprint/') { # oxfordjournals
                    substr $pth, -4, 4, '';
                    substr $pth, 5, 7, 'content/abstract';
                }
                elsif (substr($pth, 0, 9) eq '/content/'
                && substr($pth, -9) eq '.full.pdf') {   # PNAS
                    substr $pth, -8, 8, 'abstract';
                }
                else {
                    next fl_or_URI if defined $qry || defined $frg;
                }
                $file_or_URI = uri_join($sch, $aut, $pth, $qry, $frg);
            }
	    elsif ($aut eq 'www.ncbi.nlm.nih.gov'
	    && $pth =~ m#/omim/(\d[0-9,]*\d)#) {
		splice_in_omim_entries(',', $1);
		next;
	    }
	    elsif ($aut eq 'www.omim.org' && $pth eq '/search'
	    && $qry =~ m#\bsearch=(\d[0-9+]*\d)\b#) {
		splice_in_omim_entries('\+', $1);
		next;
	    }
            print {STDERR} $file_or_URI, "\n";
            #my $res = URI::Fetch->fetch($file_or_URI);
            my $res = WWW::Mechanize->new()->get($file_or_URI);
            if (!defined $res) {
                print {STDERR} $file_or_URI, "\t", URI::Fetch->errstr, "\n";
                next fl_or_URI;
            }
            elsif (!$res->decode()) {
                print {STDERR} $file_or_URI, "\tdecode failed\n";
                next fl_or_URI;
            }
	    printf "Page title: %s\n", $res->title;
            $tree->parse_content($res->content);
            $head = $tree->look_down('_tag' , 'head' );
        }
    }
    my $title;
    if (defined $head) {
        $title = $head->look_down('_tag', 'title');
        if (!defined $title) {
            print $file_or_URI, "\n";
            $head->dump();
        }
        else {
            $title->dump();
            $title = $title->as_text;
        }
        #print $head, $head->as_text , "\n";
        #$head->dump();
        foreach my $e ($head->look_down('_tag', 'meta')) {
            #print $e, $e->as_text, "\n";
            my $m_name = $e->attr('name');
            #if (defined $m_name && $m_name =~ /^(DC\.|prism\.|citation)/i) 
            if (defined $m_name) {
                my $m_content = $e->attr('content');
                if (defined $m_content && length($m_content) > 0) {
                    $m_name = lc $m_name;
                    if (!defined $cite_values{$m_name}) {
                        $cite_values{$m_name} = $m_content;
                    }
                    elsif (ref($cite_values{$m_name}) eq 'ARRAY') {
                        push @{$cite_values{$m_name}}, $m_content;
                    }
                    else {
                        $cite_values{$m_name} =
                            [$cite_values{$m_name}, $m_content];
                    }
                }
            }
            #$e->dump();
        }
    }
    foreach my $m_name (keys %cite_values) {
        if (ref($cite_values{$m_name}) ne 'ARRAY') {
            printf "%s\t%s\n", $m_name, $cite_values{$m_name};
        }
        else {
            foreach my $m_val (@{$cite_values{$m_name}}) {
                printf "%s\t%s\n", $m_name, $m_val;
            }
        }
    }
    my(%cj_values);
    my($cb_end) = '';
    foreach my $cj_tag_meta (@cj_tags) {
        my $tag = $cj_tag_meta->[0];
        meta_tag_alternatives:
        for my $m_name (@{$cj_tag_meta}[1...$#{$cj_tag_meta}]) {
            #print $tag, " ", $m_name, "\n";
            if (defined $cite_values{$m_name}) {
                #print $tag, " ", $m_name, "\n";
                if ($tag eq 'journal') {
                    $cj_values{$tag} =
                        abbreviate_journal(
                            $cite_values{$m_name}, $cj_values{'issn'});
                    delete $cj_values{'issn'};
                    last meta_tag_alternatives;
                }
                elsif ($tag eq 'pages') {
                    if (defined $cj_values{$tag}) {
                        if ($cite_values{$m_name} ne $cj_values{$tag}) {
                            $cj_values{$tag} =
                                $cite_values{$m_name} . '–' . $cj_values{$tag};
                            last meta_tag_alternatives;
                        }
                    }
                    $cj_values{$tag} = $cite_values{$m_name};
                }
                elsif ($tag eq 'author') {
                    if (ref($cite_values{$m_name}) ne 'ARRAY') {
                        $cj_values{$tag} = $cite_values{$m_name};
                    }
                    else {
                        my($authors) = $cite_values{$m_name};
                        my($max_ac) = scalar @{$authors};
                        for my $ac (0...($max_ac > 6 ? 6 : $max_ac - 1)) {
                            $cj_values{$tag . ($ac + 1)} = $authors->[$ac];
                        }
                    }
                    last meta_tag_alternatives;
                }
                elsif ($tag eq 'doi') {
                    my $doi_candidate = $cite_values{$m_name};
                    $doi_candidate =~ s/^doi://ixms;
                    if (substr($doi_candidate, 0, 3) eq '10.') {
                        $cj_values{$tag} = $doi_candidate;
                    }
                    elsif (!defined $cj_values{'url'}
                    && substr($doi_candidate, 0, 7) eq 'http://') {
                        # Use URI::Parse and allow alternatives?
                        $cj_values{'url'} = $doi_candidate;
                    }
                    last meta_tag_alternatives;
                }
                elsif ($tag eq 'publisher') {
                    if (!defined $cj_values{'journal'}) {
                        $cj_values{$tag} = $cite_values{$m_name}
                    }
                    last meta_tag_alternatives;
                }
                else {
                    if ($tag ne 'language'
                    || $cite_values{$m_name} !~ /^en/ixms) {
                        $cj_values{$tag} = $cite_values{$m_name};
                    }
                    last meta_tag_alternatives;
                }
            }
        }
    }
    if (scalar(keys %cj_values) == 0) {
        {
            my $_;
            for my $tag (qw(cre), grep { /pubname$/xms }  keys %cite_values) {
                next if !defined $cite_values{$tag};
                $cj_values{'work'} = $cite_values{$tag};
                last;
            }
            if (!defined $cj_values{'work'}
            && substr($title, 0, 9) eq 'BBC NEWS ') {
                $cj_values{'work'} = 'BBC News';
            }
        }
        if (defined $cj_values{'work'}) {
            if (defined $cite_values{'dsk'}) {  # NYTimes
                $cj_values{'work'} .= ': ' . $cite_values{'dsk'};
            }
            elsif (defined $cite_values{'section'}) {   # Guardian
                $cj_values{'work'} .= ': ' . $cite_values{'section'};
            }
        }
        {
            my $_;
            for my $tag
                (qw(hdl headline), grep { /title$/xms } keys %cite_values)
            {
                next if !defined $cite_values{$tag};
                $cj_values{'title'} = $cite_values{$tag};
                last;
            }
            if (!defined $cj_values{'title'}) {
                $cj_values{'title'} = $title;
            }
        }
        {
            my $_;
            for my $tag (grep { /date\b/xms } keys %cite_values) {
                next if !defined $cite_values{$tag};
                $cj_values{'date'} = $cite_values{$tag};
                last;
            }
        }
        if (defined $cite_values{'byl'}) {      # NYTimes
            my $byl = $cite_values{'byl'};
            if (substr($byl, 0, 3) eq 'By ') {
                $byl = substr $byl, 3
            }
            $cj_values{'author'} = $byl;
        }
        else {
            my $_;
            for my $tag (grep { /authors?$/xms } keys %cite_values) {
                next if !defined $cite_values{$tag};
                $cj_values{'author'} = $cite_values{$tag};
                last;
            }
        }
    }
    print "\n";
    if (defined $cj_values{'chapter'}) {
	if (!defined $cj_values{'isbn'} || !defined $cj_values{'title'}) {
	    $cj_values{'chapter'} = undef;
	}
	else {
	    $cj_values{'chapterurl'} = $cj_values{'url'};
	    $cj_values{'url'} = undef;
	}
    }
    if ($cj_values{'title'} =~ /^ OMIM Entry - [#*] (\d+) - (.*) $/) {
	my($omim_no, $omim_desc) = ($1, $2);
	$omim_desc =~ s/\bTYPE\b/Type/g;
	$omim_desc =~ s/\bDISEASE\b/Disease/g;
	$omim_desc =~ s/\bSYNDROME\b/Syndrome/g;
	print '{{OMIM|', $omim_no, '|', $omim_desc, "}}\n";
	next;
    }
    if (defined $cite_values{'ncbi_db'} && $cite_values{'ncbi_db'} eq 'books') {
	$cj_values{'id'} = $cite_values{'ncbi_acc'};
	if (defined $cite_values{'ncbi_domain'}
	&& $cite_values{'ncbi_domain'} eq 'gene') {
	    $cj_values{'publisher'} = undef;
	    $cb_end = "}} In {{cite book |editor=Pagon RA, Bird TD, Dolan CR, ''et al.'' |title=GeneReviews™ &#x05B;Internet&#x05D; |year=1993– |publisher=University of Washington, Seattle |location=Seattle WA |url=http://www.ncbi.nlm.nih.gov/books/n/gene/TOC/ |ref={{harvid|GeneReviews}}}}";
	}
	elsif (defined $cite_values{'ncbi_pdid'}
	&& $cite_values{'ncbi_pdid'} eq 'book-part') {
	
	    $cj_values{'chapter'} = $cj_values{'title'};
	    $cj_values{'chapter'} = undef;
	    $cj_values{'chapterurl'} = $cj_values{'url'};
	    $cj_values{'url'} = undef;
	    $cb_end = '}}';
	}
	else {
	    $cb_end = '}}';
	}
	print '{{cite book ';
    }
    for my $m_name (@cj_tag_order) {
        if (defined $cj_values{$m_name}) {
            printf '|%s=%s ', $m_name, $cj_values{$m_name};
        }
    }
    print $cb_end, "\n";
    # These append type suffixes for Entrez PubMed search terms
    # http://www.ncbi.nlm.nih.gov/pubmed
    my %pubmed_tag =
    (
        'title' => 'ti',
        'author' => 'au',
        'issue' => 'ip',
        'year' => 'dp',
        'date' => 'dp',
        'volume' => 'vi',
        'journal' => 'ta',
        'pages' => 'pg',
    );
    if (!defined $cj_values{'pmid'} && !defined $cj_values{'isbn'})  {
	for my $m_name (@cj_tag_order) {
	    if (defined $cj_values{$m_name} && defined $pubmed_tag{$m_name}) {
		printf '%s[%s] ', $cj_values{$m_name}, $pubmed_tag{$m_name};
	    }
	}
    }
    print "\n";
    #$tree->dump();
    $tree->delete;
}