User:V111P/js/wikiParserV.js

Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump.
This code will be executed when previewing this page.
This user script seems to have a documentation page at User:V111P/js/wikiParserV.
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
/*
 * wikiParserV.js
 * ver. 2013-11-02
 * Home: http://en.wikipedia.org/wiki/User:V111P/js/wikiParserV
 *
 * This is a library of useful functions, mostly for working with wiki code.
 * Includes functions for removing html tags.
 *
 * You can use the code in this script under the
 * Creative Commons Attribution 3.0 Unported License (CC-BY 3.0)
 * http://creativecommons.org/licenses/by/3.0/
 * If you do use it, please let me know. Thanks.
 */

mediaWiki.libs.wikiParserV = window.wikiParser = (function () {
	"use strict";

	// Library version number; exposed as the `version` property of the returned object.
	var version = 1000;
	// Regular expressions shared by the functions below. Several functions also
	// store lazily created patterns on this object (re.someName || (re.someName = ...)).
	var re = {
		// characters that escapeForRegExp() prefixes with a backslash
		// NOTE(review): "^" and "$" are listed twice, while "}" and "-" are not listed
		escForRegExpG: /[.*+?^$|()[\]{\\^$]/g,
		// probe pattern for checkRegexSupport(): a "<" that does not begin
		// <a>, </a>, <b> or </b> (uses a negative lookahead)
		testRe: /<(?!\/?(a|b)>)/g,
		// any character other than an ASCII letter, a digit, "_" or "-"
		nonAlphanumericAndHyphenCharsG: /[^A-Za-z0-9_-]/g,
		htmlCommentsG: /(\n)?<!--[\S\s]*?-->\1?/g // replace it with $1
	};
	var locale = {};  // used in removeElements()
	var $tempDiv = $('<div/>'); // used in unescapeCharEntities()
	// assigned by formatUrl() from mw.config on every call
	var wgScriptPath;
	// replacement table of encodeSectionNameForUrl(), created on its first call
	var sectionNameUriEncodingAdditionalReplacements;


	// Decodes the HTML character references in str (for example &amp; or &#39;)
	// by letting the detached $tempDiv element parse the string and reading its text back.
	// Every "<" and ">" is escaped first, so nothing in str can be parsed as a tag.
	// Returns the decoded string.
	function unescapeCharEntities(str) {
		// regexes with the g flag are needed here: a string pattern
		// would replace only the first "<" and the first ">"
		var markupFree = str.replace(/</g, '&lt;').replace(/>/g, '&gt;');
		return $tempDiv.html(markupFree).text();
	}


	// Builds a site-relative URL for a page.
	// article - the page title, optionally followed by "#" and a section name;
	//           spaces are converted to underscores
	// noredir - if truthy, an index.php URL with redirect=no is returned
	// edit    - if truthy (and noredir is not), an index.php URL with action=edit
	//           is returned; the section name is dropped in this case
	// Otherwise a /wiki/ URL is returned.
	function formatUrl(article, noredir, edit) {
		wgScriptPath = mw.config.get('wgScriptPath');
		article = article.replace(/ /g, '_');

		// split at the first "#": everything after it is the section name
		var hashAt = article.indexOf('#');
		var page = (hashAt == -1 ? article : article.slice(0, hashAt));
		var section = (hashAt == -1 ? '' : article.slice(hashAt + 1));

		// the title is always percent-encoded, so characters such as "?" and "&"
		// cannot be taken for URL syntax; "/" and ":" are restored for readability
		page = encodeURIComponent(page).replace(/%2F/gi, '/').replace(/%3A/gi, ':');
		var fragment = (section === ''
			? ''
			: '#' + encodeURIComponent(section).replace(/%/g, '.'));

		// the fragment has to come after the query string,
		// otherwise the query parameters would become a part of the fragment
		if (noredir)
			return wgScriptPath + '/index.php?title=' + page + '&redirect=no' + fragment;
		else if (edit)
			return wgScriptPath + '/index.php?title=' + page + '&action=edit';
		else
			return '/wiki/' + page + fragment;
	} // formatUrl


	// Encodes a section name for use after the "#" of a URL:
	// spaces become underscores, the result is percent-encoded, the characters
	// of the table below are encoded by hand ("%3A" is turned back into ":"),
	// and finally every "%" is replaced with ".".
	function encodeSectionNameForUrl(str) {
		// the table is created on the first call only and kept for the later calls
		if (!sectionNameUriEncodingAdditionalReplacements) {
			sectionNameUriEncodingAdditionalReplacements = [
				{re: /~/g,  newVal: '.7E'},
				{re: /!/g,  newVal: '.21'},
				{re: /\*/g, newVal: '.2A'},
				{re: /\(/g, newVal: '.28'},
				{re: /\)/g,  newVal: '.29'},
				{re: /\'/g, newVal: '.27'},
				{re:/%3A/g, newVal: ':'}
			];
		}
		var table = sectionNameUriEncodingAdditionalReplacements;

		var encoded = encodeURIComponent(str.replace(/ /g, '_'));
		for (var k = 0; k < table.length; k++) {
			encoded = encoded.replace(table[k].re, table[k].newVal);
		}

		return encoded.replace(/%/g, '.');
	} // encodeSectionNameForUrl


	// Encodes a section name for use as an id:
	// "." becomes "_46", the name is then passed through encodeSectionNameForUrl(),
	// every ":" becomes "_3A" and every remaining character that is not
	// an ASCII letter, a digit, "_" or "-" becomes "_".
	function encodeSectionNameForId(str) {
		return encodeSectionNameForUrl(str.replace(/\./g, '_46'))
			// the g flag is needed: without it only the first colon
			// would become "_3A" and the others would end up as "_"
			.replace(/:/g, '_3A')
			.replace(re.nonAlphanumericAndHyphenCharsG, '_');
	} // encodeSectionNameForId


	// Returns str with a backslash inserted before every character matched by
	// re.escForRegExpG, so that the result can be used as literal text in a RegExp source.
	function escapeForRegExp(str) {
		var backslashThenMatch = '\\$&'; // $& stands for the matched character
		var escaped = str.replace(re.escForRegExpG, backslashThenMatch);
		return escaped;
	} // escapeForRegExp


	// Creates the regular expressions used by removeEls() to delete elements
	// that begin with startTag and end with endTag.
	// Returns {pretreat, main}, both with the flags "gi":
	//   main     - matches an element that contains no other start or end tag,
	//              together with one optional line break before and after it
	//   pretreat - null unless startTagOfEmbededEl is given; matches an embedded
	//              element (one with the same closing tag) inside the outer element,
	//              capturing the text from the outer start tag up to it as $1
	// In startTag the sequences <<< @@@ >>> stand for ( | ) - see removeElRegExpStartArr().
	function removeElRegExp(startTag, endTag, startTagOfEmbededEl) {
		var anyChar = '[\\S\\s]';
		// the placeholders are not touched by the escaping, so they can be converted afterwards
		var openSrc = escapeForRegExp(startTag)
			.replace(/<<</g, '(')
			.replace(/@@@/g, '|')
			.replace(/>>>/g, ')');
		var closeSrc = escapeForRegExp(endTag);
		var pretreat = null;

		if (startTagOfEmbededEl) {
			var innerOpenSrc = escapeForRegExp(startTagOfEmbededEl);
			pretreat = new RegExp([
				'(', openSrc, '(?:(?!', closeSrc, ')', anyChar, ')*?)',
				innerOpenSrc, '(?:(?!', innerOpenSrc, ')', anyChar, ')*?',
				closeSrc
			].join(''), 'gi');
		}

		var main = new RegExp([
			'(\\n)?', openSrc,
			'((?!', openSrc, '|', closeSrc, ')', anyChar, ')*',
			closeSrc, '\\1?'
		].join(''), 'gi');

		return {pretreat: pretreat, main: main};
	} // removeElRegExp


	// Like removeElRegExp(), but for elements whose start tag has several variants:
	// the start tag is startTagPre, then any one of the strings in startTagArr,
	// then startTagPost.
	// startTagOfEmbededEl - needed because, for example, files and wiki links have the same
	// closing tags, so to remove files, pass '[[File:' as start tag and '[[' as startTagOfEmbededEl
	function removeElRegExpStartArr(startTagPre, startTagArr, startTagPost,
									endTag, startTagOfEmbededEl) {
		// <<< @@@ >>> are placeholders that removeElRegExp() converts into ( | )
		var alternatives = '<<<' + startTagArr.join('@@@') + '>>>';
		var startTag = startTagPre + alternatives + startTagPost;
		return removeElRegExp(startTag, endTag, startTagOfEmbededEl);
	} // removeElRegExpStartArr


	// Deletes from data everything matched by the regular expressions in res
	// (an object with the properties pretreat and main, as created by removeElRegExp()).
	// Each expression is applied again and again, replacing the matches with $1,
	// until the text stops changing or iterationLimit passes (default: 1000) are used up.
	// res.pretreat, if set, is applied before res.main.
	// Returns the resulting text.
	function removeEls(data, res, iterationLimit) {
		iterationLimit = iterationLimit || 1000;

		function stripRepeatedly(text, pattern) {
			var passesLeft = iterationLimit; // guards against an endless loop
			var before;
			do {
				passesLeft--;
				before = text;
				text = text.replace(pattern, '$1');
			} while (text != before && passesLeft > 0);
			return text;
		}

		if (res.pretreat)
			data = stripRepeatedly(data, res.pretreat);
		return stripRepeatedly(data, res.main);
	} // removeEls


	// Saves all the names of some namespaces, as listed in mw.config's wgNamespaceIds:
	//   locale.specialNsArr  - names of namespace -1 ('special')
	//   locale.fileNsArr     - names of namespaces 6 ('file'/'image') and -2 ('media')
	//   locale.categoryNsArr - names of namespace 14 ('category')
	function saveNsNames() {
		var specialNames = locale.specialNsArr = [];
		var fileNames = locale.fileNsArr = [];
		var categoryNames = locale.categoryNsArr = [];

		// namespace number -> the list that collects the names of that namespace
		var listForNsId = {
			'-1': specialNames,
			'6': fileNames,
			'-2': fileNames,
			'14': categoryNames
		};

		$.each(mw.config.get('wgNamespaceIds'), function (name, nsId) {
			if (!listForNsId.hasOwnProperty(nsId))
				return; // some other namespace
			var list = listForNsId[nsId];
			if ($.inArray(name, list) == -1)
				list.push(name);
		});
	} // saveNsNames


	// won't work in all cases
	// Removes the nowiki and pre tags from data and replaces the wiki markup
	// characters they were protecting with HTML character references:
	//  - an empty nowiki tag (<nowiki/>, <nowiki /> or <nowiki></nowiki>) standing
	//    between two characters: one of the two neighbouring characters is escaped
	//  - <nowiki>...</nowiki> and <pre>...</pre>: every markup character
	//    of the contents is escaped
	// Returns the resulting text.
	function escCharsForNowikiTags(data) {
		var nowikiCharTranslMap = {
			'[': '&#91;', ']': '&#93;', '{': '&#123;', '}': '&#125;',
			'<': '&lt;', '>': '&gt;', ':': '&#58;', '*': '&#42;', '#': '&#35;'
		};

		//en.wikipedia.org/wiki/Help:Nowiki#WP:NOWIKI

		// $1 - the character before the tag (empty at the start of the text),
		// $2 - the character after it. The "<" of the tag itself has to be a part
		// of the pattern, otherwise $1 would always be that "<".
		var singleCharEscReG = re.singleCharEscG
			|| (re.singleCharEscG = /(.|^)<(?:nowiki ?\/|nowiki><\/nowiki)>(.)/g);
		data = data.replace(singleCharEscReG, function (m, $1, $2) {
			if ($1 == '<') return '&lt;' + $2;
			if (nowikiCharTranslMap[$2]) return $1 + nowikiCharTranslMap[$2];
			if (nowikiCharTranslMap[$1]) return nowikiCharTranslMap[$1] + $2;
			// nothing to escape: only drop the tag
			// (returning nothing would insert the text "undefined")
			return $1 + $2;
		});

		var noWikiElReG = re.noWikiElG || (re.noWikiElG = /<(nowiki|pre)>([\S\s]*?)<\/\1>/g);
		var noWikiReplaceCharsReG = re.noWikiReplG || (re.noWikiReplG = /\[|]|\{|}|<|>|:|\*|#/g);
		data = data.replace(noWikiElReG, function (match, $1, $2) {
			return $2.replace(noWikiReplaceCharsReG, function (ch) {
				// look up the matched character, not the contents of the whole element
				return nowikiCharTranslMap[ch];
			});
		});

		return data;
	} // escCharsForNowikiTags


	// Removes whole classes of wiki/html elements from data.
	// elStr - a comma+space-separated list of what to remove. Recognized values:
	//   'comments', 'tables', 'templates', 'references', 'files' (also removes
	//   <gallery> elements), 'categories', 'bold/italic', 'behavior switches',
	//   'others' (removes <timeline> elements)
	// The regular expressions are created on first use and cached in re.
	// Returns the resulting text.
	function removeElements(data, elStr) {
		var requested = elStr.split(', ');

		function wants(what) {
			return $.inArray(what, requested) > -1;
		}

		if (wants('comments'))
			data = data.replace(re.htmlCommentsG, '$1');

		if (wants('tables')) {
			if (!re.wikiTable)
				re.wikiTable = removeElRegExp('{|', '|}');
			data = removeEls(data, re.wikiTable);
			if (!re.htmlTable)
				re.htmlTable = removeElRegExp('<table', '</table>');
			data = removeEls(data, re.htmlTable);
		}

		if (wants('templates')) {
			if (!re.templates)
				re.templates = removeElRegExp('{{', '}}');
			data = removeEls(data, re.templates);
		}

		if (wants('references')) {
			if (!re.refs)
				re.refs = /<ref[^>]*?(\/>|>[\S\s]*?<\/ref\s*>)/ig;
			data = data.replace(re.refs, '');
		}

		if (wants('files')) {
			// the names of the file namespaces are needed for the start tags
			if (!locale.fileNsArr)
				saveNsNames();
			if (!re.files)
				re.files = removeElRegExpStartArr('[[', locale.fileNsArr, ':', ']]', '[[');
			data = removeEls(data, re.files);
			if (!re.gallery)
				re.gallery = /(\n)?<gallery[^>]*>[\S\s]*?<\/gallery>\1?/gi;
			data = data.replace(re.gallery, '$1');
		}

		if (wants('categories')) {
			if (!locale.categoryNsArr)
				saveNsNames();
			if (!re.category)
				re.category = removeElRegExpStartArr('[[', locale.categoryNsArr, ':', ']]');
			data = removeEls(data, re.category);
		}

		if (wants('bold/italic')) {
			if (!re.boldItalicG)
				re.boldItalicG = /<\/?(i|b|strong|em)>|'''?|(&#39;){2,3}/gi;
			data = data.replace(re.boldItalicG, '');
		}

		if (wants('behavior switches')) {
			if (!re.behaviorSwitchesG)
				re.behaviorSwitchesG = /(\n)?__[^\s]+?__\1?/g;
			data = data.replace(re.behaviorSwitchesG, '$1');
		}

		if (wants('others')) {
			if (!re.timelineG)
				re.timelineG = /(\n)?<timeline>[\S\s]*?<\/timeline>\1?/gi;
			data = data.replace(re.timelineG, '$1');
		}

		return data;
	} // removeElements;


	// Converts all wiki links in data to plain text: [[target|label]] becomes label
	// and [[target]] becomes target.
	// all files ([[File:...]]) must be removed BEFORE calling this function
	// Returns the resulting text.
	function unlink(data) {
		var prev, cntr = 1000;
		// both expressions are cached under the same name they are looked up by
		var remAddrReG = re.remAddrG || (re.remAddrG = /\[\[[^|\]]*\|/g);
		var unlinkLinksReG = re.unlinkLinksReG || (re.unlinkLinksReG = /\[\[([^\]\[]+)\]\]/g);
		do {
			cntr--; // anti infinite-loop var
			prev = data;
			// remove addresses from all links:
			// each pass removes the text up to the next pipe char, so only the last part is left
			data = data.replace(remAddrReG, '[[');
		} while (data != prev && cntr > 0);

		// unlink all links:
		data = data.replace(unlinkLinksReG, '$1');
		return data;
	} // unlink


	// Converts the wiki markup for bold ('''...''') and italic (''...'') text
	// into <b>...</b> and <i>...</i>.
	// A bold or italic run that is not closed on its own line ends at the end
	// of that line; the line break itself is kept.
	// Returns the resulting text.
	function boldAndItalicToHtml(data) {
		if (!re.boldAndItalicToHtml1) {
			// the first regex removes four, six, or more apostrophes;
			// $1 and $3 are the neighbouring characters, which have to be kept
			re.boldAndItalicToHtml1 = /(^|[^'])''''('{2,})?([^']|$)/g;
			// the closing part is either the closing apostrophes or the end of the line;
			// the lookahead leaves the line break in the text
			re.boldAndItalicToHtml2 = /'''([^'\n][^\n]*?)(?:'''|(?=\n)|$)/g;
			re.boldAndItalicToHtml3 = /''([^\n]+?)(?:''|(?=\n)|$)/g;
		}

		return data.replace(re.boldAndItalicToHtml1, '$1$3')
			.replace(re.boldAndItalicToHtml2, '<b>$1</b>')
			.replace(re.boldAndItalicToHtml3, '<i>$1</i>');
	} // boldAndItalicToHtml


	// Returns only the text before the first section title
	// (section titles start with = at the beginning of a line).
	// If that text is empty, an empty string is returned.
	// If there are no sections at all, the whole text is returned -
	// with the categories removed if removeCategories is truthy.
	function beforeTheFirstSection(data, removeCategories) {
		// the heading may be on the very first line (^) or after a line break,
		// the same as in divideSections()
		var beforeFirstSectRe = re.beforeFirstSect
			|| (re.beforeFirstSect = /^([\S\s]*?)(?=(?:^|\n)(=+).+?\2[^\S\n]*(?:\n|$))/);
		var found = beforeFirstSectRe.exec(data);
		// test the match itself and not the captured text:
		// the text before the first section can be an empty string
		if (found)
			return found[1];
		return removeCategories ? removeElements(data, 'categories') : data;
	} // beforeTheFirstSection


	// Splits wiki text into its sections.
	// Returns an array of objects {eq, level, heading, contents}:
	//   eq       - the run of = characters of the heading ('' for the first element)
	//   level    - the number of those characters (0 for the first element)
	//   heading  - the trimmed heading text ('' for the first element)
	//   contents - the trimmed text of the section, up to the next heading of any level
	// The first element holds the text before the first section,
	// as returned by beforeTheFirstSection(). At most 1000 sections are returned.
	function divideSections(data) {
		var sections = [];
		sections.push({
			eq: '',
			level: 0,
			heading: '',
			contents: beforeTheFirstSection(data, false)
		});
		var match;
		// (?=\n|$) after the heading: the last line of the text can be a heading as well
		var regex = re.divSectionsG ||
			(re.divSectionsG = /(^|\n)(=+)(.+?)\2[^\S\n]*(?=\n|$)([\S\s]*?)(?=\n(=+).+?\5[^\S\n]*(?:\n|$)|$)/g);
		var cntr = 1000;
		// the regex is cached and has the g flag: start from the beginning of the text
		// even if a previous call has stopped (at the section limit) in the middle of its text
		regex.lastIndex = 0;
		while ((match = regex.exec(data)) && cntr > 0) {
			cntr--;
			sections.push({
				eq: match[2],
				level: match[2].length,
				heading: $.trim(match[3]),
				contents: $.trim(match[4])
			});
		}

		return sections;
	} // divideSections


	// Tells whether the regex engine handles the negative lookahead of re.testRe
	// correctly: the pattern is applied to a fixed sample and the result is
	// compared with the expected text. Returns true or false.
	function checkRegexSupport() {
		var sample = '<a><bd</e></b>';
		var expected = '<a>&lt;bd&lt;/e></b>';
		return (sample.replace(re.testRe, '&lt;') == expected);
	}


	// Removes the html comments and the html tags from data, except for the tags
	// in the comma+space-separated whiteListTagsStr list.
	// All the attributes of the white-listed tags are removed.
	// Unless leaveSpecialChars is truthy, the special characters that are left are
	// html-escaped: < and > (if not a part of a white-listed tag), & (if not
	// followed by what looks like a character reference), and the " ' ` quotes.
	// Throws 1 if checkRegexSupport() fails and 2 if the text is still changing
	// after 1000 passes.
	// Returns the resulting text.
	function sanitizeHtml(data, whiteListTagsStr, leaveSpecialChars) {

		// applies pass() to the text again and again until the text stops changing
		function repeatUntilStable(text, pass) {
			var passesLeft = 1000;
			var before;
			do {
				before = text;
				passesLeft--;
				text = pass(text);
			} while (before != text && passesLeft > 0);
			if (passesLeft <= 0) throw 2;
			return text;
		}

		if (!checkRegexSupport())
			throw 1; // no (lookahead) regex support

		var whiteList = (whiteListTagsStr || '').split(', ').join('|');
		var useWhiteList = (whiteList !== '');
		var otherTagsG, ltOutsideTagG, gtOutsideTagG, attributesG, anyTagG;

		// still created and cached here, although the passes below use re.htmlCommentsG
		if (!re.htmlCommentG)
			re.htmlCommentG = /<!--[\S\s]*?-->/g;

		if (useWhiteList) {
			// the expressions that depend on the list are cached per list string
			if (!re.resByWhitelist)
				re.resByWhitelist = {};
			var cache = re.resByWhitelist;
			var perList = cache[whiteListTagsStr] || (cache[whiteListTagsStr] = {});

			if (!perList.nonWhiteListedTagsG)
				perList.nonWhiteListedTagsG = new RegExp('<(?!/?(' + whiteList + ')(\\b|/))[^>]*>', 'gi');
			if (!perList.lessThanNotBeforeWLTagG)
				perList.lessThanNotBeforeWLTagG = new RegExp('<(?!/?(' + whiteList + ')/?>)', 'gi');
			if (!perList.grThanNotAndAfterWLTagG)
				perList.grThanNotAndAfterWLTagG = new RegExp('(</?(' + whiteList + ')/?)?>', 'gi');
			if (!re.tagAttributesG)
				re.tagAttributesG = /<(\/?[a-z][a-z0-9]*)[^>]*?(\/)?>/gi;

			otherTagsG = perList.nonWhiteListedTagsG;
			ltOutsideTagG = perList.lessThanNotBeforeWLTagG;
			gtOutsideTagG = perList.grThanNotAndAfterWLTagG;
			attributesG = re.tagAttributesG;
		}
		else {
			if (!re.allTagsG)
				re.allTagsG = /<(\b|\/)[^>]*>/g;
			anyTagG = re.allTagsG;
		}

		// first stage: comments and tags
		data = repeatUntilStable(data, function (text) {
			// remove comments:
			text = text.replace(re.htmlCommentsG, '$1');
			if (!useWhiteList)
				return text.replace(anyTagG, '');
			// remove all tags except the white-listed ones,
			// then all attributes from the remaining tags:
			return text.replace(otherTagsG, '').replace(attributesG, '<$1$2>');
		});

		if (leaveSpecialChars)
			return data;

		// second stage: special characters
		var ampNotInCharRefReG = re.ampReG || (re.ampReG = /&(?!#?[xX]?[a-zA-Z0-9]+;)/g);
		return repeatUntilStable(data, function (text) {
			if (useWhiteList) {
				// html-escape all < and > except if part of a whitelisted tag
				text = text.replace(ltOutsideTagG, '&lt;')
					.replace(gtOutsideTagG, function (whole, tagPart) {
						return tagPart ? whole : '&gt;';
					});
			}
			else { // html-escape all < and > chars
				text = text.replace(/</g, '&lt;').replace(/>/g, '&gt;');
			}
			// escape & to &amp; if obviously not a part of a char ref,
			// then all quotes (` is used in old IE)
			return text.replace(ampNotInCharRefReG, '&amp;')
				.replace(/"/g, '&quot;')
				.replace(/'/g, '&#39;')
				.replace(/`/g, '&#96;');
		});
	} // sanitizeHtml


	// Finds the segment of one of the given kinds around the selection/cursor.
	// bsa - an array: [text_before_the_selection/cursor, selection, text_after]
	// segmentNames - an array of segment names or a comma+space-separated string.
	//    Only 'wikilink' is handled; for it the result of focusedCustomSegment()
	//    is returned. Nothing is returned if 'wikilink' is not in the list.
	function focusedSegment(bsa, segmentNames) {
		var names = segmentNames;
		if (typeof names != 'object')
			names = names.split(', ');

		var idx = 0;
		while (idx < names.length) {
			if (names[idx] == 'wikilink')
				return focusedCustomSegment(bsa, '[[', ']]', '', '[]<>{}');
			idx++;
		}
	}


	// Expands the selection (or the cursor position) to the whole segment around it,
	// for example to the whole wiki link the cursor is in.
	// bsa - an array with 3 elements: [text_before_the_selection/cursor, selection, text_after]
	// the other arguments - the char(s) indicating the start/end of the segment
	// otherStartChars (optional) - start chars of other segments with the same endChars,
	//    needed only for some elements, for example if startChars is [[File:,
	//    otherStartChars needs to be [[ because links can be embedded in file elements.
	// invalidBeforePipe - a string with individual illegal characters. Illegal only if before
	//    the first pipe character "|" (or anywhere, if there is no pipe character).
	// Returns a new [before, segment, after] array,
	// or nothing if no valid segment was found.
	function focusedCustomSegment(bsa, startChars, endChars, otherStartChars, invalidBeforePipe) {

		function endMatches(str, endChars) {
			return (str.slice(-endChars.length) === endChars);
		}

		function startMatches(str, startChars) {
			return (str.slice(0, startChars.length) === startChars);
		}

		var before = bsa[0];
		var selection = bsa[1]; // the selection
		var after = bsa[2];
		var spaces;
		var i;

		if (!startChars || !endChars)
			return;

		if (selection) { // there is some selected text
			spaces = selection.match(/^\s+/);
			if (spaces) { // spaces at the beginning of the selected text

				if (endMatches(before, startChars)) {
					selection = startChars + selection;
					before = before.slice(0, -startChars.length);
				}
				else {
					// move the spaces to the end of the text-before-the-selection:
					before += spaces[0];
					selection = selection.slice(spaces[0].length);
					// check for startChars at beginning of selection:
					if (!startMatches(selection, startChars))
						return;
				}
			}
			else {
				// while no (complete) startChars string at beginning of selection:
				// move a char from the end of textBefore to the beginning of selection
				var startCharsFound = false;
				for (i = 0; i <= startChars.length; i++) {
					if (startMatches(selection, startChars)) {
						startCharsFound = true;
						break;
					}
					if (before.length == 0)
						break;
					selection = before.slice(before.length - 1) + selection;
					before = before.slice(0, before.length - 1);
				}
				if (!startCharsFound)
					return;

				// TODO: check if selection contains only one outer element,
				//        and the start-end chars are balanced
			}

			spaces = selection.match(/\s+$/);
			if (spaces) { // spaces at the end of the selected text
				if (startMatches(after, endChars)) {
					selection = selection + endChars;
					after = after.slice(endChars.length);
				}
				else {
					// move the spaces to the beginning of the text-after-the-selection:
					after = spaces[0] + after;
					selection = selection.slice(0, -spaces[0].length);
					if (!endMatches(selection, endChars))
						return;
				}
			}
			else {
				// while no (complete) endChars string found at end of selection:
				// move a char from the beginning of textAfter to the end of selection
				var endCharsFound = false;
				for (i = 0; i <= endChars.length; i++) {
					if (endMatches(selection, endChars)) {
						endCharsFound = true;
						break;
					}
					if (after.length == 0)
						break;
					selection = selection + after.charAt(0);
					after = after.slice(1);
				}
				if (!endCharsFound)
					return;
			}
		} // if (selection)
		else { // no text selected
			var text = before + after;
			// TODO: add a loop to allow the cursor to be after an embedded element
			// NOTE(review): the purpose of the constant 3 is not evident from the code
			var startCharsAt = text.lastIndexOf(startChars, before.length + startChars.length - 3);
			if (startCharsAt == -1)
				return;
			var closing = startCharsAt;
			var opening = startCharsAt;
			var openingOther;
			var segmentFound = false;
			i = 0;
			// each step moves on to the next end marker and to the next start marker;
			// the segment ends at the first end marker found before the start marker
			// reached in the same step
			while (i++ < 10) {
				closing = text.indexOf(endChars, closing + 1);
				if (closing == -1) {
					return;
				}
				if (otherStartChars) {
					openingOther = text.indexOf(otherStartChars, opening);
				}
				opening = text.indexOf(startChars, opening + 1);
				if (opening == -1)
					opening = text.length;
				if (otherStartChars) {
					if (openingOther > -1)
						opening = (openingOther < opening ? openingOther : opening);
				}
				if (closing < opening) {
					// the segment ends before the cursor
					if (closing < before.length - endChars.length) {
						return;
					}
					// the segment ends where the end marker ends,
					// so it is the length of endChars that has to be added
					var segmentEnd = closing + endChars.length;
					selection = text.slice(startCharsAt, segmentEnd);
					before = text.slice(0, startCharsAt);
					after = text.slice(segmentEnd);
					segmentFound = true;
					break;
				}

			}
			// the end of the segment was not found within the step limit
			if (!segmentFound)
				return;
		}

		if (invalidBeforePipe) {
			var invalidEscForRe = escapeForRegExp(invalidBeforePipe);
			var beforePipe = selection.slice(startChars.length, -endChars.length).match(/[^|]*/)[0];
			if (beforePipe.match('[' + invalidEscForRe + ']'))
				return;
		}

		return [before, selection, after];
	} // focusedCustomSegment


	// The public API of the library. The helper functions removeEls(), saveNsNames(),
	// and escapeForRegExp() are not exposed.
	return {
		version: version,
		unescapeCharEntities: unescapeCharEntities,
		formatUrl: formatUrl,
		encodeSectionNameForUrl: encodeSectionNameForUrl,
		encodeSectionNameForId: encodeSectionNameForId,
		checkRegexSupport: checkRegexSupport,
		escCharsForNowikiTags: escCharsForNowikiTags,
		removeElRegExp: removeElRegExp,
		removeElRegExpStartArr: removeElRegExpStartArr,
		removeElements: removeElements,
		unlink: unlink,
		sanitizeHtml: sanitizeHtml,
		boldAndItalicToHtml: boldAndItalicToHtml,
		beforeTheFirstSection: beforeTheFirstSection,
		divideSections: divideSections,
		focusedCustomSegment: focusedCustomSegment, // incomplete implementation
		focusedSegment: focusedSegment // works only for wikilinks right now
	};
})();