Wikipedia:Bots/Requests for approval/BHGbot 7/Make-BHGbot7-edit-list.sh

#!/bin/bash
# Make-BHGbot7-edit-list.sh
#
# This script creates a list of category-space pages to be created as category redirects for [[WP:BHGbot 7]]
# Each entry in the list is the name of a category page to be created as a redirect to the same title,
# but with the "z" in "organization" replaced with "s", and vice versa.
#
# e.g. the page [[Category:Anti-Foobar organisations]] is to be created with
#    the content {{Category redirect|Anti-Foobar organizations|bot=BHGbot}}
#
# Three input files are required.  Each is the output of a quarry query, saved in tab-delimited format.
# i)   "quarry-orgcats": the output of https://quarry.wmflabs.org/query/46899
# ii)  "quarry-allcats": the output of https://quarry.wmflabs.org/query/46999
# iii) "quarry-all-articles": the output of https://quarry.wmflabs.org/query/47001
#
#
# There are 5 steps:
# 1. Convert the list in quarry-orgcats by replacing s with z and vice-versa: output in fixed-orgcats
# 2. Safety check: remove any entries in fixed-orgcats which do not contain "organi[sz]ation" 
# 3. Prepare the data files for comparison
# 4. Remove from fixed-orgcats:
#    a) existing category pages
#    b) titles which exist in article space
# 5. Convert the list into wikilinked format for use by AWB
#
#
# ==============================
# STEP 1:
# Swap the spelling of every whole word "organisation(s)" / "organization(s)"
# in quarry-orgcats (either capitalisation of the leading "o"), and save the
# result in fixed-orgcats.
#
# How the single sed program below works, expression by expression:
#   1. underscores become spaces, so that \b can match at the edges of each
#      word of a title (GNU sed treats "_" as a word character)
#   2. the "s" spellings are parked on the placeholder %%%@#!%%%
#   3. the "z" spellings become "s"
#   4. the parked placeholder becomes "z"; parking first stops pass 3 from
#      undoing the s -> z swap
#   5. spaces go back to underscores
#
# The output redirection is written before the input redirection so that
# fixed-orgcats is (re)created even when quarry-orgcats cannot be opened.
#
sed -E \
    -e 's/_/ /g' \
    -e 's/\b([oO]rgani)s(ations?)\b/\1%%%@#!%%%\2/g' \
    -e 's/\b([oO]rgani)z(ations?)\b/\1s\2/g' \
    -e 's/\b([oO]rgani)%%%@#!%%%(ations?)\b/\1z\2/g' \
    -e 's/ /_/g' \
    > fixed-orgcats < quarry-orgcats
printf '%s' "created fixed-orgcats. #Lines: "
wc -l  fixed-orgcats
#
#
# ==============================
# STEP 2:
# Safety check: purge from fixed-orgcats any lines which do not contain "organi[sz]ation"
# This should remove only one line: the "page title" header from the quarry output.
# Any more removals indicates an error in the input data, so a warning is
# written to stderr whenever more than one line is purged.
#
echo -n "purging from fixed-orgcats any lines which do not contain organi[sz]ation: "
# The pattern uses no PCRE-only features, so -E is enough here; -P is a GNU
# extension which is not available in every grep.
grep -E '[oO]rgani[sz]ations?' < fixed-orgcats > fixed-orgcats-purged
echo -n "DONE. #Lines: "
wc -l  fixed-orgcats-purged
# Count the lines which were dropped (-v inverts the match, -c counts).
# If fixed-orgcats cannot be read the count is empty and is treated as 0.
purged_count=$(grep -c -v -E '[oO]rgani[sz]ations?' < fixed-orgcats)
if [ "${purged_count:-0}" -gt 1 ]; then
    echo "WARNING: ${purged_count} lines were purged from fixed-orgcats (expected 1, the header line); check the input data" >&2
fi
#
#
# ==============================
# STEP 3:
# Prepare each of the data files
# Each file needs to be:
# a) converted to unix format by stripping out the CR from the CR-LF pairs.
# b) sorted alphabetically to allow use of comm to compare files
#
# The CRs are stripped BEFORE sorting.  comm requires its inputs to be sorted,
# and it reads the CR-free files, so the sort has to be done on the CR-free
# text: sorting first and stripping afterwards can leave lines out of order
# (e.g. a line which continues with a TAB sorts ahead of the same text
# followed directly by CR in the C locale).
#
echo ""
echo -n "sorting fixed-orgcats-purged: "
tr -d '\015' < fixed-orgcats-purged | sort > fixed-orgcats-sorted
echo -n "DONE. #lines: "
wc -l fixed-orgcats-sorted
echo -n "sorting quarry-allcats: "
tr -d '\015' < quarry-allcats | sort > quarry-allcats-sorted
echo -n "DONE. #lines: "
wc -l quarry-allcats-sorted
echo -n "sorting quarry-all-articles (may be slow): "
tr -d '\015' < quarry-all-articles | sort > quarry-all-articles-sorted
echo -n "DONE. #lines: "
wc -l  quarry-all-articles-sorted
#
#
# ==============================
# STEP 4
# Whittle the candidate list down to the titles which should be created.
# "comm -2 -3" prints only the lines unique to its first (sorted) input, so
# each pass drops every candidate which also appears in the second file:
#   pass 1 drops titles which already exist as category pages
#   pass 2 drops titles which exist in article space
#
printf '\n'
printf '%s' "Removing existing category pages from the list of redirects to be created: "
comm -2 -3 fixed-orgcats-sorted quarry-allcats-sorted > fixed-orgcats-notexist
printf '%s' "DONE. #lines: "
wc -l fixed-orgcats-notexist
printf '%s' "Removing existing aricle titles from the list of categ redirects to be created: "
comm -2 -3 fixed-orgcats-notexist quarry-all-articles-sorted > redirect-cats-to-create-bare
printf '%s' "DONE. #lines: "
wc -l redirect-cats-to-create-bare
#
#
# ==============================
# STEP 5
# Turn the bare list of titles into the wikilinked list used by AWB, then
# print the summary statistics.
#
# Each title is wrapped as "# [[:Category:<title>]]": the first sed
# expression adds the prefix at the start of the line, the second appends
# the closing brackets at the end.  The output redirection comes first so
# that redirect-cats-to-create.txt is (re)created even when the input file
# cannot be opened.
#
printf '\n'
printf '%s' "Wikilink the list of redirects to be created: "
sed -e 's/^/# [[:Category:/' -e 's/$/]]/' \
    > redirect-cats-to-create.txt < redirect-cats-to-create-bare
printf '%s\n' "DONE"

# Summary: one labelled line count per intermediate file.
printf '\n\n\n%s\n' "===== FINISHED ====="
printf '%s\n' "Stats:"
printf '%s' "Existing non-redirected, non-dab cats with organi[sz]ation in title: "
wc -l fixed-orgcats-purged
printf '%s' "Proposed redirects which don't already exist as cats: "
wc -l fixed-orgcats-notexist
printf '%s' "Proposed redirects which don't already exist as cats or as article titles: "
wc -l redirect-cats-to-create-bare
printf '\n%s\n' "List of redirects to created is at redirect-cats-to-create.txt"
printf '%s' "Number of redirects to create: "
wc -l redirect-cats-to-create.txt