#!/usr/bin/env python3
#
#<nowiki>
"""Outputs a list of articles from a given dump file containing
"citation" or "cite [anything]" templates with duplicate values in
publisher and either encyclopedia, journal, magazine, newspaper,
series, title, website, or work parameters, or in journal and series
parameters.
For [[WP:RAQ#Find all instances of journal=publisher]] circa 28 June
2023.
"""
import sys
assert sys.version_info >= (3, 6), f"requires Python 3.6 or newer"
import argparse
import bz2
import os
import re
import shutil
import textwrap
import xml.sax
import mwparserfromhell
#############
# Arguments #
#############
# Runtime settings.  These are only placeholders; _main() overwrites
# every one of them from the parsed command line.
_all_namespaces = False  # -a: look at pages outside the article namespace
_count = None            # -c: number of pages read so far, or None if off
_output = None           # -o: file object that receives the result lines
_print_matches = False   # -m: also report the matching parameter names

# Help text shown by argparse for the options above.
HELP_ALL_NAMESPACES = "parse pages in all namespaces, not just article"
HELP_COUNT = """output a running count of matched pages to stderr,
updating every thousand pages read"""
HELP_OUTPUT = "output file, a list of page titles; defaults to stdout"
HELP_PRINT_MATCHES = """output the page name, a tab, and the names of
the first set of matching template parameters
instead of just the page name"""
#################
# Other globals #
#################
# _rx_rough_match is used to eliminate pages from consideration before
# the expensive full parse; it's important that it have no false
# negatives.  re.DOTALL is required for that: citation templates are
# very often written with one parameter per line, and without the flag
# ".*" stops at the first newline, so "publisher"/"series" on a later
# line of the template would never be seen and the page would be
# skipped.
_rx_rough_match = re.compile(
    r"{{\s*[cC]it(?:ation\b|e ).*(publisher|series)", re.DOTALL)
# target template name: exactly "citation", or anything starting with
# "cite " (first letter in either case)
_rx_template_name = re.compile(r"^[cC]it(?:ation$|e )")
_namespaces = {}    # maps namespace numbers to names (with a trailing ":")
_matched_pages = 0  # count of pages w/at least one duplicate param pair
class _XMLHandler(xml.sax.ContentHandler):
def __init__(self):
super().__init__()
self.ns = None
self.title = None
self.text = None
self.tags = [None]
self.namespace = None
self.namespace_key = None
def startElement(self, name, attrs):
if name == "page":
self.ns = None
self.title = None
self.text = None
# These shouldn't be present in <page> tags anyway, but.
self.namespace = None
self.namespace_key = None
elif name == "ns":
self.ns = ""
elif name == "title":
self.title = ""
elif name == "text":
self.text = ""
elif name == "namespace":
self.namespace = ""
self.namespace_key = int(attrs.get("key"))
else:
return
self.tags.append(name)
def endElement(self, name):
if name == self.tags[-1]:
self.tags.pop()
if ((name == "page" and self.text is not None
and self.ns is not None and self.title is not None)):
process_page(int(self.ns), self.title, self.text)
elif name == "namespace" and self.namespace_key is not None:
_namespaces[self.namespace_key] = self.namespace + ":"
def characters(self, content):
if self.tags[-1] == "ns":
self.ns += content
elif self.tags[-1] == "title":
self.title += content
elif self.tags[-1] == "text":
self.text += content
elif self.tags[-1] == "namespace":
self.namespace += content
def pagename(ns, title):
    """Return human-readable name of page title in numbered namespace ns.

    ns is the integer namespace number and title the page title as a
    str.  Namespace 0 yields the bare title; a namespace known from
    _namespaces yields its stored prefix followed by the title; any
    other number falls back to a "{{ns:N}}:" prefix.
    """
    # NOTE(review): if <title> values in the dump already carry the
    # namespace prefix, the prefix added here is redundant - confirm
    # against a real dump.
    if ns == 0:  # Special-case to omit the :
        return title
    if ns in _namespaces:
        # The stored prefix already ends in ":" (the XML handler
        # appends it), so adding another one here would produce
        # "Talk::Title".
        return _namespaces[ns] + title
    return "{{ns:" + str(ns) + "}}:" + title
def process_page(ns, title, text):
    """Handle one page read from the dump.

    ns is the page's namespace number, title its title and text its
    plain wikitext.  A page in an eligible namespace that passes
    _rx_rough_match is given to has_dupe_cite_params(); if that reports
    a duplicate, the match counter is bumped and one line is written to
    _output.  Independently of that, the pages-read counter (when
    enabled) is advanced and a progress line goes to stderr on every
    multiple of 1000.
    """
    global _count, _matched_pages

    in_scope = _all_namespaces or ns == 0
    if in_scope and _rx_rough_match.search(text):
        dupe = has_dupe_cite_params(text)
        if dupe is not None:
            _matched_pages += 1
            line = pagename(ns, title)
            if _print_matches:
                line = line + "\t" + dupe
            print(line, file=_output)

    # Counting is off unless requested on the command line.
    if _count is None:
        return
    _count += 1
    if _count % 1000 == 0:
        print(f"Read {_count} pages, matched {_matched_pages}",
              file=sys.stderr)
def has_dupe_cite_params(text):
    """If text contains a citation template with duplicate parameters
    we're looking for, return a string suitable for the print-matches
    option; else None.

    text is the page's wikitext.  Templates are examined in the order
    the parser returns them, and only the first offending pair of
    parameters is reported.  A pair counts as duplicate when both
    values are non-empty and equal after stripping surrounding
    whitespace.
    """
    def errval(template, param1name, param2name, paramval):
        """Return a string suitable for the print-matches option"""
        return ("{{" + str(template.name).strip() + "}}:" + param1name + ","
                + param2name + '="' + paramval + '"')

    def param(template, param_name):
        """Return the wikicode of template's parameter param_name as a
        str, or None if empty or not present
        """
        par = template.get(param_name, default=None)
        if par is None:
            return None
        rval = str(par.value).strip()
        if rval == "":
            return None
        return rval

    for t in mwparserfromhell.parse(text).filter_templates():
        # Strip the name before matching.  errval() already strips it
        # for display and the rough regex tolerates whitespace after
        # "{{"; without the strip here, "{{citation |...}}" (name
        # "citation ") fails the "$" anchor and "{{ cite web|...}}"
        # fails the "^" anchor, so such templates were silently skipped.
        if not _rx_template_name.match(str(t.name).strip()):
            continue

        publisher = param(t, "publisher")
        if publisher is not None:
            for other in ("encyclopedia",
                          "journal",
                          "magazine",
                          "newspaper",
                          "series",
                          "title",
                          "website",
                          "work"):
                if publisher == param(t, other):
                    return errval(t, "publisher", other, publisher)

        # journal/series is checked whether or not publisher is set.
        journal = param(t, "journal")
        if journal is not None and journal == param(t, "series"):
            return errval(t, "journal", "series", journal)
    return None
def _fill_paragraphs(text, width=None):
"""Returns text, wrapped as per textwrap.fill(), but preserve
paragraph splits (as denoted by sequences of two newlines).
"""
# width is pulled from argparse.HelpFormatter().__init__() to try
# to match the default behavior - and hence option formatting - as
# closely as practical. Irritatingly, it changed in 3.8, which I
# happened to notice by accident.
#
# It is infuriating that argparse neither publicizes its formatter
# classes so they can be properly overridden, nor exposes width
# determination so they can be reliably mimicked. Oh well, if it
# changes again, it's ok if *this* looks a little ugly, and it'll
# break less badly than subclassing the private classes would.
if width is None:
if sys.version_info >= (3, 8):
width = shutil.get_terminal_size().columns
else:
try:
width = int(os.environ['COLUMNS'])
except (KeyError, ValueError):
width = 80
width -= 2
return "\n\n".join([textwrap.fill(s, width) for s in text.split("\n\n")])
def _main():
    """Script entry point: parse the command line, set the module-level
    options, stream the dump file through _XMLHandler, and finally
    print the page/match totals to stderr if counting was requested.
    """
    parser = argparse.ArgumentParser(
        description=_fill_paragraphs(__doc__),
        # pylint: disable=bad-continuation
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("dumpfile",
                        help="input dump file, in xml or bzip2-compressed xml")
    parser.add_argument("-a", "--all-namespaces",
                        action="store_true",
                        help=HELP_ALL_NAMESPACES)
    parser.add_argument("-c", "--count",
                        action="store_true",
                        help=HELP_COUNT)
    parser.add_argument("-m", "--print-matches",
                        action="store_true",
                        help=HELP_PRINT_MATCHES)
    parser.add_argument("-o", "--output",
                        default=sys.stdout,
                        type=argparse.FileType("w", encoding="utf-8"),
                        help=HELP_OUTPUT)
    args = parser.parse_args()

    global _all_namespaces, _count, _output, _matched_pages, _print_matches
    _all_namespaces = args.all_namespaces
    _count = 0 if args.count else None
    _print_matches = args.print_matches
    _output = args.output
    _matched_pages = 0

    try:
        # Sniff the first three bytes for the bzip2 magic number "BZh"
        # to decide how the dump has to be opened.
        with open(args.dumpfile, 'rb') as f:
            magic = f.read(3)
        if magic == b'\x42\x5a\x68':
            dump = bz2.BZ2File(args.dumpfile)
        else:
            dump = open(args.dumpfile, 'r', encoding='utf-8')

        # Close the dump explicitly instead of depending on the XML
        # parser or the garbage collector to do it.
        with dump:
            xml.sax.parse(dump, _XMLHandler())
    finally:
        # A file opened for -o is ours to close; stdout is not.
        if _output is not sys.stdout:
            _output.close()

    # don't print this if count's divisible by 1000 and > 0, since it
    # would duplicate the print in process_page()
    if _count is not None and (_count == 0 or _count % 1000 != 0):
        print(f"Read {_count} pages, matched {_matched_pages}",
              file=sys.stderr)


if __name__ == "__main__":
    _main()
#</nowiki>