/*
Citation Watchlist Script – Highlights watchlist entries when questionable sources are added
(User:Harej/citation-watchlist.js)

author:  Hacks/Hackers
license: GPL 4.0
*/

/* Per-wiki configuration */

const LANGUAGE = 'en';
const FAMILY = 'wikipedia';
const endpoint = `https://${LANGUAGE}.${FAMILY}.org/w/api.php`;
const restApiEndpoint = `https://api.wikimedia.org/core/v1`;
const publicSuffixList = "Wikipedia:Citation_Watchlist/Public_Suffix_List";
const listOfLists = "Wikipedia:Citation_Watchlist/Lists";
const msgWarning = "Warning";
const msgCaution = "Caution";
const msgInspect = "Inspect";
const warnEmoji = '\u2757'; // ❗ heavy exclamation mark
const cautionEmoji = '\u270B'; // ✋ raised hand
const inspectEmoji = '\uD83D\uDD0E'; // 🔎 magnifying glass (U+1F50E as a surrogate pair)
const warnSectionHeader = "==Warn==";
const cautionSectionHeader = "==Caution==";
const inspectSectionHeader = "==Inspect==";


let publicSuffixSet = new Set();
let warnList = new Set();
let cautionList = new Set();
let inspectList = new Set();

function prependEmojiWithTooltip(element, emoji, domains, tooltipText) {
  let processedType = '';
  if (emoji === warnEmoji) {
    processedType = 'warn';
  } else if (emoji === cautionEmoji) {
    processedType = 'caution';
  } else if (emoji === inspectEmoji) {
    processedType = 'inspect';
  } else {
    console.error('Unsupported emoji type');
    return;
  }

  if (element.getAttribute(`data-processed-${processedType}`) === 'true') {
    return;
  }

  const emojiSpan = document.createElement('span');
  emojiSpan.textContent = emoji + " ";
  emojiSpan.title = tooltipText + ": " + domains.join(", ");
  element.parentNode.insertBefore(emojiSpan, element);
  element.setAttribute(`data-processed-${processedType}`, 'true');
}
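
// Illustrative call (not executed): for a watchlist row element that matched
// the warn list via example.com, the following would prepend "❗ " to the row
// with the hover text "Warning: example.com":
//
//   prependEmojiWithTooltip(rowElement, warnEmoji, ['example.com'], msgWarning);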

async function parseWatchlist() {
  // Select all containers of the watchlist links to process them individually
  const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
  const revisions = [];

  let linkCounter = 0;

  for (const container of entriesContainers) {
    // Within each container, try to find a "diff" link first
    const diffLink = container.querySelector('a.mw-changeslist-diff');
    const histLink = container.querySelector('a.mw-changeslist-history');
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    const curLink = container.querySelector('a.mw-history-histlinks-current');

    console.log(`
    Diff link: ${diffLink}
    Hist link: ${histLink}
    Prev link: ${prevLink}
    Cur link: ${curLink}
    `);

    let urlParams;

    if (diffLink) {
      linkCounter += 1;
      // If a "diff" link is found, process it
      // Parse the link's full URL; URLSearchParams alone would fold the path
      // into the first parameter's key
      urlParams = new URL(diffLink.href).searchParams;
      const diffId = await fetchPreviousRevisionId(urlParams.get('diff'));
      revisions.push({
        oldrevision: urlParams.get('diff'),
        newrevision: diffId,
        element: diffLink.parentNode.parentNode
      });
    } else if (histLink) {
      linkCounter += 1;
      // If no "diff" link is found but a "hist" link is, process the "hist" link
      urlParams = new URL(histLink.href).searchParams;
      const pageID = urlParams.get('curid');
      const firstID = await fetchFirstRevisionId(pageID);
      revisions.push({
        oldrevision: firstID,
        element: histLink.parentNode.parentNode
      });
      // At this point, check if we are on a page history rather than watchlist
    } else if (prevLink) {
      linkCounter += 1;
      urlParams = new URL(prevLink.href).searchParams;
      const diffId = await fetchPreviousRevisionId(urlParams.get('oldid'));
      revisions.push({
        oldrevision: urlParams.get('oldid'),
        newrevision: diffId,
        element: prevLink.parentNode.parentNode
      });
      // No prev link means we are at the page's first revision
    } else if (curLink) {
      linkCounter += 1;
      urlParams = new URL(curLink.href).searchParams;
      revisions.push({
        // We do not actually want to compare to the current revision; we
        // extract the oldid and treat it like a new page.
        oldrevision: urlParams.get('oldid'),
        element: curLink.parentNode.parentNode
      });
    }
  }

  // If no links of any kind were found, we are on a page history with only
  // one revision. Fetch that first (and only) revision ID via the page ID.
  if (linkCounter === 0) {
    const pageID = mw.config.get('wgArticleId');
    const firstID = await fetchFirstRevisionId(pageID);
    revisions.push({
      oldrevision: firstID,
      element: entriesContainers[0]
    });
  }

  return revisions;
}

function buildURL(params) {
  const url = new URL(endpoint);
  Object.keys(params).forEach(key => url.searchParams.append(key, params[key]));
  return url;
}
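
// Example (not executed): with the default settings above,
//   buildURL({ action: 'query', format: 'json' })
// yields https://en.wikipedia.org/w/api.php?action=query&format=json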

function getRootDomain(hostname, publicSuffixSet) {
  // Walk down the hostname's labels until a public suffix matches, then
  // return the suffix plus one label (the registrable domain). This is a
  // simplified lookup: wildcard (*) and exception (!) rules are treated as
  // if they were plain suffixes.
  const domainParts = hostname.split('.');
  for (let i = 0; i < domainParts.length; i++) {
    const candidate = domainParts.slice(i).join('.');
    if (publicSuffixSet.has(candidate) || publicSuffixSet.has(`!${candidate}`)) {
      if (i === 0) {
        return hostname; // the hostname itself is a public suffix
      }
      return domainParts.slice(i - 1).join('.');
    }
  }
  return hostname;
}
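
// Worked example (not executed), assuming the suffix set contains 'co.uk':
//   getRootDomain('news.example.co.uk', publicSuffixSet)
// tests 'news.example.co.uk', then 'example.co.uk', then 'co.uk'; 'co.uk' is
// the first hit, so the function returns one label more: 'example.co.uk'.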

function extractAddedURLs(addedParts) {
  const addedURLs = [];
  const urlRegex = /https?:\/\/[^\s<"]+/g;
  let match;
  while ((match = urlRegex.exec(addedParts)) !== null) {
    addedURLs.push(match[0]);
  }
  return addedURLs;
}
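
// Example (not executed): given the diff text
//   'added <ref>https://example.com/story</ref> here'
// extractAddedURLs returns ['https://example.com/story']; the regex stops at
// whitespace, '<', and '"'.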

async function fetchFromActionAPI(params) {
  const url = buildURL(params);

  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    return await response.json();
  } catch (error) {
    console.error('Error fetching data from MediaWiki API:', error);
    throw error;
  }
}

async function fetchPublicSuffixList() {
  const pslUrl = `https://${LANGUAGE}.${FAMILY}.org/wiki/${publicSuffixList}?action=raw`;
  try {
    const response = await fetch(pslUrl);
    const content = await response.text();
    const suffixSet = new Set();
    const lines = content.split('\n');
    for (const line of lines) {
      if (line.trim() && !line.trim().startsWith('//')) {
        suffixSet.add(line.trim());
      }
    }
    return suffixSet;
  } catch (error) {
    console.error("Error fetching Public Suffix List:", error);
    return new Set();
  }
}
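
// The on-wiki copy is expected to follow the Public Suffix List format:
// '//' comment lines are skipped and each remaining non-blank line is one
// rule, e.g.:
//
//   // ===BEGIN ICANN DOMAINS===
//   com
//   co.uk
//   *.ck
//   !www.ck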

async function fetchDiffFromAPI(apiUrl) {
  // Called with either a bare revision URL (expected to return { source })
  // or a compare URL (expected to return { diff: [...] })
  try {
    const response = await fetch(apiUrl);
    const data = await response.json();
    return data["source"] || data["diff"];
  } catch (error) {
    console.error('Error fetching API content:', error);
    return null;
  }
}

async function fetchDiffAndProcess(revision) {
  // Note: parseWatchlist stores the revision under review in
  // revision.oldrevision and its parent in revision.newrevision, so this
  // comparison runs from the newer revision back to the older one.
  let apiUrl = `${restApiEndpoint}/${FAMILY}/${LANGUAGE}/revision/${revision.oldrevision}`;
  if (revision.newrevision !== undefined) {
    apiUrl += `/compare/${revision.newrevision}`;
  }
  const diff = await fetchDiffFromAPI(apiUrl);
  let addedURLs = [];

  if (Array.isArray(diff)) { // actual diffs are arrays; new pages are strings
    // Since the comparison runs from the newer revision back to its parent,
    // "from" lines (types 2 and 4) belong to the newer revision and "to"
    // lines (types 1 and 5) to the parent. Type 3 represents changes within
    // a line; extracting URL changes from those would be harder.
    let fromURLs = [];
    let toURLs = [];

    for (const diffLine of diff) {
      const lineURLs = extractAddedURLs(diffLine.text);
      // 'lineURL', not 'URL', so the global URL constructor is not shadowed
      for (const lineURL of lineURLs) {
        if (diffLine.type === 2 || diffLine.type === 4) {
          fromURLs.push(lineURL);
        } else if (diffLine.type === 1 || diffLine.type === 5) {
          toURLs.push(lineURL);
        }
      }
    }

    // URLs unique to the newer revision are the ones that were added
    const toURLSet = new Set(toURLs);
    addedURLs = fromURLs.filter(url => !toURLSet.has(url));
  } else {
    addedURLs = extractAddedURLs(diff);
  }

  console.log(`Old revision: ${revision.oldrevision}
New revision: ${revision.newrevision}
API URL: ${apiUrl}
Revision element: ${revision.element.innerHTML}
Added URLs: ${addedURLs.join(' ')}`);

  const matchedWarnDomains = [];
  const matchedCautionDomains = [];
  const matchedInspectDomains = [];

  // Record each matched domain once, at its most severe level (warn takes
  // precedence over caution, which takes precedence over inspect)
  for (const url of addedURLs) {
    const hostname = new URL(url).hostname;
    const domain = getRootDomain(hostname, publicSuffixSet);

    if (warnList.has(domain) && !matchedWarnDomains.includes(domain)) {
      matchedWarnDomains.push(domain);
    } else if (cautionList.has(domain) && !matchedCautionDomains.includes(domain)) {
      matchedCautionDomains.push(domain);
    } else if (inspectList.has(domain) && !matchedInspectDomains.includes(domain)) {
      matchedInspectDomains.push(domain);
    }
  }

  if (matchedWarnDomains.length > 0) {
    prependEmojiWithTooltip(revision.element, warnEmoji, matchedWarnDomains, msgWarning);
  }
  if (matchedCautionDomains.length > 0) {
    prependEmojiWithTooltip(revision.element, cautionEmoji, matchedCautionDomains, msgCaution);
  }
  if (matchedInspectDomains.length > 0) {
    prependEmojiWithTooltip(revision.element, inspectEmoji, matchedInspectDomains, msgInspect);
  }
}

async function fetchAndOrganizeDomainLists(pageName) {
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageName,
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };

  try {
    const data = await fetchFromActionAPI(params);
    const page = data.query.pages;
    const pageId = Object.keys(page)[0];
    const content = page[pageId].revisions[0].slots.main['*'];

    // Local sets (they shadow the globals); runScript() merges them into
    // the global lists
    const warnList = new Set();
    const cautionList = new Set();
    const inspectList = new Set();

    let currentList = null;

    const lines = content.split('\n');
    for (let line of lines) {
      if (line.trim() === warnSectionHeader) {
        currentList = warnList;
      } else if (line.trim() === cautionSectionHeader) {
        currentList = cautionList;
      } else if (line.trim() === inspectSectionHeader) {
        currentList = inspectList;
      }

      if (line.startsWith('*') && currentList) {
        const domain = line.substring(1).trim();
        currentList.add(domain);
      }
    }

    return {
      warnList,
      cautionList,
      inspectList
    };
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}
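
// A domain list page is expected to look roughly like the wikitext below
// (section headers written without spaces; hypothetical domains shown):
//
//   ==Warn==
//   * unreliable.example
//   ==Caution==
//   * questionable.example
//   ==Inspect==
//   * lookcloser.example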

async function fetchPreviousRevisionId(oldid) {
  const params = {
    action: 'query',
    prop: 'revisions',
    revids: oldid,
    rvprop: 'ids',
    format: 'json',
    origin: '*'
  };

  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const pageId = Object.keys(pages)[0];
    const revisions = pages[pageId].revisions;

    if (revisions && revisions.length > 0) {
      return revisions[0].parentid; // Get the parent revision ID
    } else {
      throw new Error('No revisions found for the given revision ID.');
    }
  } catch (error) {
    console.error('Error fetching previous revision ID:', error);
    return null;
  }
}

async function fetchFirstRevisionId(pageID) {
  const params = {
    action: 'query',
    pageids: pageID,
    prop: 'revisions',
    rvlimit: 1,
    rvdir: 'newer',
    format: 'json',
    origin: '*'
  };

  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const pageId = Object.keys(pages)[0];
    const revisions = pages[pageId].revisions;

    if (revisions && revisions.length > 0) {
      return revisions[0].revid;
    } else {
      throw new Error('No revisions found for this page.');
    }
  } catch (error) {
    console.error('Error fetching first revision ID:', error);
    return null;
  }
}

async function fetchDomainListPages(pageName) {
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageName,
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };

  try {
    const data = await fetchFromActionAPI(params);
    const page = data.query.pages;
    const pageId = Object.keys(page)[0];
    const content = page[pageId].revisions[0].slots.main['*'];

    const pageTitles = [];
    const lines = content.split('\n');
    for (let line of lines) {
      if (line.startsWith('* [[')) {
        const match = line.match(/\[\[([^\]]+)\]\]/); // Matches the first instance of [[Page Title]]
        if (match) {
          pageTitles.push(match[1]);
        }
      }
    }

    return pageTitles;
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}
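
// The list-of-lists page is expected to hold one wikilink bullet per domain
// list page, e.g.:
//
//   * [[Wikipedia:Citation Watchlist/Example list]]
//
// Only the first [[Page Title]] on each '* [[' line is captured.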

async function checkPageExists(pageName) {
  const params = {
    action: 'query',
    titles: pageName,
    format: 'json',
    origin: '*'
  };

  try {
    const data = await fetchFromActionAPI(params);
    const page = data.query.pages;
    const pageId = Object.keys(page)[0];
    return pageId !== "-1"; // Page exists if pageId is not "-1"
  } catch (error) {
    console.error('Error checking page existence:', error);
    return false;
  }
}
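
// Example response shape for a missing page (the Action API keys missing
// pages under "-1"):
//
//   { "query": { "pages": { "-1": { "title": "Some page", "missing": "" } } } }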

async function runScript() {
  publicSuffixSet = await fetchPublicSuffixList();
  if (publicSuffixSet.size === 0) {
    console.error('Public Suffix List loading failed');
    return;
  }

  const listPages = await fetchDomainListPages(listOfLists);
  for (const pageName of listPages) {
    const exists = await checkPageExists(pageName);
    if (exists) {
      const lists = await fetchAndOrganizeDomainLists(pageName);
      // Merge this page's domains into the global sets
      lists.warnList.forEach(domain => warnList.add(domain));
      lists.cautionList.forEach(domain => cautionList.add(domain));
      lists.inspectList.forEach(domain => inspectList.add(domain));
    }
  }

  const watchlistRevisions = await parseWatchlist();
  for (const revision of watchlistRevisions) {
    await fetchDiffAndProcess(revision);
  }
}

runScript()
  .then(() => console.log('Citation Watchlist script loaded'))
  .catch(error => console.error('Citation Watchlist script failed:', error));