User:GreenC bot/Job 14/source

Note: this source is outdated, but it gives a general view of how the bot works.

#!/usr/bin/gawk -bE

#
# popbot  - a bot to add {{tld|<country_name> metadata Wikidata}} to infoboxes
#           Home: https://en.wikipedia.org/wiki/User:GreenC_bot/Job_14
#           Dependencies: BotWikiAwk (GitHub)
#

# The MIT License (MIT)
#
# Copyright (c) April 2019
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Bot identity; assigned in its own BEGIN rule ahead of the @include lines.
BEGIN { BotName = "popbot" }

@include "botwiki.awk"
@include "library.awk"

#
# Program entry: set configuration globals, parse the command line and hand
# over to main().
#
#   Options:  -s <file>   article.txt source to process (required)
#             -l <dir/>   directory where logging is sent
#             -n <name>   Wikipedia name of the article
#             -h          show usage and exit
#
BEGIN {

  # "bot" processes the article. Per the original author's note: set to "find"
  # to search only and exit with 1 (found something) or 0 (found nothing), run
  # via 'project -s' against the local cache; anything else processes the article.
  Mode = "bot"

  # Regex matching is case-insensitive for the whole program
  IGNORECASE = 1

  # Optional run of newline/CR/tab and space characters
  ReSpace = "[\n\r\t]*[ ]*[\n\r\t]*[ ]*[\n\r\t]*"

  # Name of the metadata template that is inserted into the infobox fields
  Country = "Spain"
  Template = Country " metadata Wikidata"

  # Set to "1" to add area + population fields. Set to "0" for population fields only
  ADDAREA = 0

  Opterr = 1
  Optind = 1
  while ((C = getopt(ARGC, ARGV, "hs:l:n:")) != -1) {
      opts++
      if(C == "h") {
        usage()
        exit
      }
      else if(C == "s")
        articlename = verifyval(Optarg)
      else if(C == "l")
        logdir = verifyval(Optarg)
      else if(C == "n")
        wikiname = verifyval(Optarg)
  }

  # Nothing to do without at least one option and a source file
  if(articlename == "" || ! opts) {
    stdErr("Error in popbot.awk (1)")
    print "0"
    exit
  }

  # Logging is discarded unless both the article name and a log directory
  # were supplied; the directory is normalised to end with "/".
  Logfile = "/dev/null"
  if(wikiname != "" && logdir != "") {
    if(logdir !~ "/$")
      logdir = logdir "/"
    Logfile = logdir "logpopbot"
  }

  Count = 0
  main()

}

#
# main - read the article, run popbot() on it and write the results
#
#   . Reads the file named by the global 'articlename'.
#   . When popbot() changed the article (and Count > 0), writes the new text to
#     "article.popbot.txt" and the edit summary to "editsummary.popbot.txt";
#     both paths are derived from 'articlename' by replacing its basename.
#   . Prints Count on stdout when files were written, otherwise "0". Always exits.
#
function main(  article,articlenew,articlenewname,editsummaryname,bn) {

  checkexists(articlename, "popbot.awk main()", "exit")
  article = readfile(articlename)
  if(length(article) < 10) {
    print "0"
    exit
  }

  articlenew = popbot(article)

  if(article != articlenew && length(articlenew) > 10 && Count > 0) {

    articlenewname = editsummaryname = articlename

    # Regex matching the basename at the end of the path
    bn = basename(articlename) "$"

    gsub(bn, "article.popbot.txt", articlenewname)
    printf("%s", articlenew) > articlenewname
    close(articlenewname)

    gsub(bn, "editsummary.popbot.txt", editsummaryname)

    # Customize the edit summary to be more specific.
    # The text is passed as data to "%s", never as the format itself, so a "%"
    # in Template cannot be interpreted as a conversion specifier.
    printf("%s", "Add {{[[Template:" Template "|" Template "]]}} (via [[User:GreenC bot/Job 14|popbot]])") > editsummaryname
    close(editsummaryname)

    print Count
    exit

  }
  print "0"
  exit

}

#
# popbot - main function
#
#   . Rewrite the population fields (plus the area fields when ADDAREA is set)
#     of an {{Infobox settlement}} so that each one calls
#     {{<Template>|<field_name>}}. Return the modified article.
#   . The new fields are inserted at the line holding population_total; when
#     ADDAREA is set and an area_metro_km2 line exists, the area fields are
#     inserted in front of that line instead.
#   . Returns the article unchanged, after logging the global 'wikiname' to
#     lognobox / lognopop under 'logdir', when no infobox or no usable
#     population_total line is found.
#   . Increments the global Count when a rewrite was performed.
#   . Sets PROCINFO["sorted_in"] (not restored) so fields are emitted in
#     alphabetical order.
#
#   NOTE(review): relies on subs() and splitn() from library.awk; subs() is
#   assumed to be a literal (non-regex) first-occurrence substitution - confirm.
#
function popbot(article,  i,a,dest,G,k,point_area,point_pop,c,re,z,N,fp) {

  re = "[{]{2}" ReSpace "Infobox settlement"
  if(article !~ re) {
    print wikiname >> (logdir "lognobox")
    return article
  }

  # population_total needed to orient where to insert fields. Skip and log if missing.

  if(article !~ /[|][ ]*population_total[ ]*[=][ ]*/) {
    print wikiname >> (logdir "lognopop")
    return article
  }

  delete G

  # Existing fields default values

  G["population_total"] =     "| population_total = {{" Template "|population_total}}"
  G["population_as_of"] =     "| population_as_of = {{" Template "|population_as_of}}"
  G["population_footnotes"] = "| population_footnotes = {{" Template "|population_footnotes}}"
  if(ADDAREA) {
    G["area_footnotes"] =       "| area_footnotes   = {{" Template "|area_footnotes}}"
    G["area_total_km2"] =       "| area_total_km2   = {{" Template "|area_total_km2}}"
  }

  # Existing fields actual values (if they exist)
  #
  # Each tracked field is captured through to the end of its line, including
  # an empty value, so the "delete the original fields" step below can find and
  # remove the exact original line. The last matching line of a field wins.

  for(i = 1; i <= splitn(article, a, i); i++) {
    for(k in G) {
      if(match(a[i], "^[ ]*[|][ ]*" k "[ ]*[=][ ]*.*", dest)) {
        G[k] = dest[0]
        break
      }
    }
  }

  # New fields values
  #
  # The field name temporarily carries a trailing "2" so the new lines cannot
  # be confused with the original lines while those are being deleted.

  PROCINFO["sorted_in"] = "@ind_str_asc"
  for(k in G) {
    if(G[k] !~ Template)          # keep "| name =" and replace the value with the template call
      N[k] = substr(G[k], 1, index(G[k], "=")) " {{" Template "|" k "}}"
    else                          # default value, or the field already uses the template
      N[k] = G[k]
    N[k] = subs(k, k "2", N[k])
  }

  i = split(article, a, "\n")

  # Find location of population_total (last occurrence).
  # 'i' doubles as the "not found" marker, so a hit on the very last line is
  # also treated as not found.

  re = "^[ ]*[|][ ]*population_total[ ]*[=][ ]*"
  point_pop = i
  for(c = 1; c <= i; c++) {
    if(a[c] ~ re)
      point_pop = c
  }
  if(point_pop >= i) {
    print wikiname >> (logdir "lognopop")
    return article
  }

  # Find location of area_metro_km2 (0 = none, or area fields not requested)

  point_area = 0
  if(ADDAREA) {
    re = "^[ ]*[|][ ]*area_metro_km2[ ]*[=][ ]*"
    point_area = i
    for(c = 1; c <= i; c++) {
      if(a[c] ~ re)
        point_area = c
    }
    if(point_area >= i)
      point_area = 0
  }

  # rebuild article with new fields in correct location within infobox

  for(c = 1; c <= i; c++) {
    if(c == point_pop) {
      # The new fields replace the original population_total line, which is
      # not copied. Without an area_metro_km2 anchor every new field goes
      # here; otherwise only the population fields do.
      for(z in N) {
        if(point_area == 0 || z ~ /population/)
          fp = fp "\n" N[z]
      }
    }
    else if(c == point_area) {     # area_metro_km2 exists, add the area fields in front of it
      for(z in N) {
        if(z ~ /area/)
          fp = fp "\n" N[z]
      }
      fp = fp "\n" a[c]
    }
    else if(c == 1)                # first line, don't add extra \n
      fp = a[1]
    else
      fp = fp "\n" a[c]
  }

  # delete the original fields

  for(z in G)
    fp = subs(G[z] "\n", "", fp)

  # remove the trailing "2" from new fields

  for(z in G)
    fp = subs(z "2", z, fp)

  # print fp > "o"

  Count++
  article = fp
  return article

}