// 2.0 - 2022 February 17
/**
 * Look up the cached record for a URL in the `web` table.
 *
 * @param {string} url - URL used as the lookup key (bound as $1).
 * @returns {?{url: *, title: *, isdead: *, work: *, metatitle: *, shouldnotplace: *}}
 *   The selected columns of the single matching row, or null when there is
 *   not exactly one row, when the row is flagged `shouldnotplace`, or when
 *   the lookup throws.
 */
function checkentry(url) {
  const inquery2 = "SELECT * FROM web WHERE url=$1";
  try {
    // NOTE(review): this reads the result synchronously. If `sql.query`
    // returns a promise, `rowCount` is undefined and every lookup yields
    // null - confirm against the client in use.
    const res = sql.query(inquery2, [url]);
    if (Number(res.rowCount) !== 1) {
      // Either no entry at all or more than one (duplicate) entry.
      return null;
    }
    const row = res.rows[0];
    if (row.shouldnotplace) {
      // dup titles, etc... detected by grabber script
      return null;
    }
    return {
      url: row.url,
      title: row.title,
      isdead: row.isdead,
      work: row.work,
      metatitle: row.metatitle,
      shouldnotplace: row.shouldnotplace,
    };
  } catch (e) {
    // Still best-effort (callers treat null as "no match"), but report the
    // failure so a broken lookup is distinguishable from a missing entry.
    console.error("checkentry: lookup failed for " + url, e);
    return null;
  }
}
/**
 * Report whether a page title contains any marker of an unusable title
 * (error pages, parking pages, bot checks, and so on).
 *
 * Matching is a case-insensitive substring test against a fixed list.
 *
 * @param {string} str - Title to inspect.
 * @returns {boolean} true when any marker occurs in the title, else false.
 */
function checktitle(str) {
  const badones = [
    "error",
    "not found",
    "sorry",
    "cookies",
    "404",
    "410",
    "just a moment",
    "unavailable",
    "not available",
    "untitled",
    "web server is down",
    "wayback machine",
    "archive.",
    "attention required",
    "paywall",
    "503",
    "too many requests",
    "under construction",
    "hugedomains",
    "godaddy",
    "are you a robot",
    "loading...",
    "account suspended",
    "domain for sale",
    "access denied",
    "browser settings",
    "suspended",
    "unsupported",
    // The misspelled form is kept so titles with the same typo still match;
    // the correctly spelled form is what most pages actually use.
    "down for maintainence",
    "down for maintenance",
    "captcha",
  ];
  // Lower-case once instead of once per marker.
  const lowered = str.toLowerCase();
  return badones.some((marker) => lowered.includes(marker));
}
/**
 * Escape a string so it can be embedded in a RegExp source and match
 * itself literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} The text with each of . ? * + ^ $ [ ] \ ( ) { } | -
 *   preceded by a backslash.
 */
function regexp_quote(str) {
  // "|" is included: unescaped it would turn the quoted text into an
  // alternation ("a|b" would match "a" or "b" rather than the literal "a|b").
  return str.replace(/[.?*+^$[\]\\(){}|-]/g, "\\$&");
}
/**
 * Walk the parser tokens inside one <ref>...</ref> and, for each eligible bare
 * URL token, either replace it with a parsed {{cite web}} template built from
 * the entry returned by checkentry(), or append a {{Dead link}} template when
 * that entry is flagged dead. A following {{bare url inline}} template is
 * removed after a successful fill.
 *
 * Mutates `refitem` in place and returns nothing.
 *
 * NOTE(review): the original text of this function did not parse
 * (`else (placeDead) {` and one missing closing brace) and referred to an
 * undeclared `obj`. This version assumes `obj` was meant to be `refitem` and
 * that the website-splitting `if` ends before the title length check, which is
 * the only placement that keeps the dead-link branch reachable - confirm.
 *
 * @param {Array} refitem - Token list to process (array-like with a `type`).
 * @param {string} [datefmt=""] - Not used by the logic; only forwarded when
 *   recursing.
 */
function traverse(refitem, datefmt = "") {
  let traversedcount = -1; // index of the token currently being visited
  let removebaretemp = false; // a fill happened; the next {{bare url inline}} may go
  for (const refobj of refitem) {
    traversedcount += 1;
    if (typeof refobj === "string") {
      // This is a recursive function, so it is sometimes handed a string (or
      // reaches one); a string cannot be descended into, so step out.
      // NOTE(review): this also stops at the first string *sibling*, e.g. the
      // whitespace between a URL and {{bare url inline}} - confirm `return`
      // rather than `continue` is intended.
      return;
    }
    if (refobj == null) {
      // Hole left behind by an earlier `delete` on this list.
      continue;
    }
    // Loose comparison with true is kept from the original on purpose.
    if (refobj.type === "url" && refobj.is_bare == true) {
      const outcome = bareurl_resolve(refobj, refitem);
      if (outcome === null) {
        continue;
      }
      if (outcome.dead) {
        // note the leading space in the template text
        const deadlink = " {{Dead link|bot=BareRefBot|date=February 2022}}";
        refitem.push(CeL.net.wiki.parser(deadlink).parse());
        continue;
      }
      refitem[traversedcount] = CeL.net.wiki.parser(outcome.wikitext).parse();
      console.log("done with " + outcome.title);
      removebaretemp = true;
    }
    if (refobj.type === "external_link") {
      continue;
    }
    if (refobj.type === "transclusion" && refobj.name.toLowerCase() === "bare url inline" && removebaretemp) {
      delete refitem[traversedcount];
      removebaretemp = false;
    }
    // Deal with nested refs, and other parser strangeness. The slot may have
    // just been deleted, so only descend into real objects.
    const child = refitem[traversedcount];
    if (refitem.type === "tag_inner" && typeof child === "object" && child !== null) {
      traverse(child, datefmt);
    }
  }
}

/**
 * Decide what to do with one bare URL token.
 *
 * @returns {?{dead: boolean, wikitext: (string|undefined), title: (string|undefined)}}
 *   null to leave the token alone, `{dead: true}` to append a dead-link
 *   template, or `{dead: false, wikitext, title}` with the citation to parse.
 */
function bareurl_resolve(urltoken, refitem) {
  let usethisurl = urltoken[0].toString();
  const skipmarkers = [
    "archive.", // everything else (note the . at the end)
    "webcit", // webcite
    "youtube.com",
    "twitter.com",
    "facebook.com",
    "instagram.com",
  ];
  if (skipmarkers.some((marker) => usethisurl.includes(marker))) {
    // These should either be in archive-url (out of scope) or have no
    // integrated fix yet.
    return null;
  }
  if (!bareurl_standsalone(refitem)) {
    return null;
  }
  usethisurl = usethisurl.replaceAll("|", "%7C"); // escape for CS1
  if (usethisurl.includes(".pdf")) {
    return null;
  }

  const retstruct = checkentry(usethisurl);
  if (!retstruct || !retstruct.title) {
    // no match
    return null;
  }
  const placeDead = Boolean(retstruct.isdead);
  let usethistitle = retstruct.title;
  let usethiswebsite = "";
  if (retstruct.metatitle && retstruct.work && !placeDead) {
    const split = bareurl_splitwebsite(usethistitle, retstruct.metatitle, retstruct.work, usethisurl);
    usethistitle = split.title;
    usethiswebsite = split.website;
  }

  if (usethistitle.length > 75 || usethiswebsite.length > 35) {
    // Some malformed websites have absurdly long titles. Don't fill these in.
    return null;
  }
  if (checktitle(usethistitle)) {
    // Bad title: we don't know if it is dead, just that something is wrong.
    return null;
  }
  if (usethisurl.indexOf(usethistitle.toLowerCase()) >= 0) {
    // The title is just part of the URL (e.g. a domain name); don't fill.
    return null;
  }
  if (placeDead) {
    return { dead: true };
  }

  usethistitle = usethistitle.replaceAll("|", "{{!}}"); // escape pipes in titles
  // Characters that would trigger CS1 errors; a value containing any of them
  // is skipped rather than altered.
  const blacklisted = /[\u00a0\u00ad\ufffd\u200a\u200b\u200d\u0009\u0010\u0013\u007f\u0000-\u001f\u0080-\u0094]/;
  if (blacklisted.test(usethistitle) || blacklisted.test(usethiswebsite)) {
    return null;
  }

  let wikitext = "{{cite web |url=" + usethisurl + " |title=" + usethistitle.trim();
  if (usethiswebsite) {
    wikitext += " |website=" + usethiswebsite.trim();
  }
  wikitext += "}}";
  return { dead: false, wikitext, title: usethistitle };
}

/**
 * True when the ref holds nothing besides URLs, whitespace and
 * {{bare url inline}}: no non-blank text and no other template.
 */
function bareurl_standsalone(refitem) {
  for (const sibling of refitem) {
    if (sibling == null) {
      continue; // hole left by an earlier delete
    }
    if (typeof sibling === "string") {
      if (sibling.trim() !== "") {
        // e.g. <ref>https://website.website is an amazing site</ref>
        return false;
      }
      continue;
    }
    if (sibling.type === "transclusion" && sibling.name.toLowerCase() !== "bare url inline") {
      // Unrecognized template inside the ref; it might be out of scope.
      return false;
    }
  }
  return true;
}

/**
 * Work out the title/website pair from the page's meta title and the
 * confirmed website name (`work`), stripping a trailing "<sep> <work>" from
 * the title when present. Falls back to the URL's host name as the website
 * when the two values cannot be used.
 */
function bareurl_splitwebsite(title, rawmetatitle, rawwork, url) {
  const metatitle = rawmetatitle.replaceAll("|", "{{!}}");
  const work = rawwork.replaceAll("|", "{{!}}");
  const metatitle_lcase = metatitle.toLowerCase();
  const work_lcase = work.toLowerCase();
  const title_lcase = title.toLowerCase();
  const workpos = metatitle_lcase.indexOf(work_lcase);

  const usable =
    metatitle.trim() !== "" &&
    work.trim() !== "" &&
    metatitle_lcase !== work_lcase &&
    work_lcase !== title_lcase &&
    (metatitle_lcase !== title_lcase || workpos > 0) &&
    !work_lcase.includes(metatitle_lcase);
  if (!usable) {
    // Website name unknown or not separable from the title: use the domain.
    return { title, website: bareurl_hostname(url) };
  }
  if (workpos > 0) {
    // The website name is inside the title; strip a trailing "<sep> <work>".
    const suffix = new RegExp("[»|–—-]+\\s+" + regexp_quote(work) + "$");
    const stripped = metatitle.replace(suffix, "");
    if (stripped.toLowerCase() !== metatitle_lcase && stripped.trim() !== "") {
      return { title: stripped, website: work };
    }
    // Nothing was stripped (or nothing was left): keep the cached title.
    return { title, website: "" };
  }
  return { title: metatitle, website: work };
}

/** Host name of a URL, or "" when the text is not a parseable absolute URL. */
function bareurl_hostname(url) {
  try {
    return new URL(url).hostname;
  } catch (err) {
    // Deliberate fallback: without a host name the website field is omitted.
    return "";
  }
}
/**
 * Read a wikitext file, run traverse() over the content of every <ref> tag,
 * and write the result back to the same file.
 *
 * Nothing is written when the rendered text is empty, blank, or shorter than
 * the input (sanity check to prevent blanking the page).
 *
 * @param {string} filename - Path of the file to read and overwrite.
 */
function main(filename) {
  const wikitxt = fs.readFileSync(filename).toString();
  const parsed_data = CeL.net.wiki.parser(wikitxt).parse();
  // `datetype` is not declared in the visible source; read it defensively so
  // a missing global cannot abort the run. NOTE(review): confirm where it is
  // meant to come from.
  const datefmt = typeof datetype === "undefined" ? "" : datetype;
  parsed_data.each("tag_inner", function refprocess(token, index, parent) {
    if (!parent || parent.tag !== "ref") {
      // Don't convert bare URLs outside refs (e.g. external link sections).
      return;
    }
    traverse(token, datefmt);
  });
  console.log("done");
  const writeto = parsed_data.toString();
  if (!writeto || writeto.trim().length === 0 || writeto.length < wikitxt.length) {
    // Should never be shorter than the input; leave the file untouched.
    return;
  }
  writeFile(filename, writeto);
}