User:V111P/js/wikiParserV.js

< User:V111P‎ | js
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
/*
 * wikiParserV.js
 * ver. 2013-11-02
 * Home: http://en.wikipedia.org/wiki/User:V111P/js/wikiParserV
 *
 * This is a library of useful functions, mostly for working with wiki code.
 * Includes functions for removing html tags.
 *
 * You can use the code in this script under the
 * Creative Commons Attribution 3.0 Unported License (CC-BY 3.0)
 * http://creativecommons.org/licenses/by/3.0/
 * If you do use it, please let me know. Thanks.
 */

mediaWiki.libs.wikiParserV = window.wikiParser = (function () {
	"use strict";

	var version = 1000;
	// Shared regular expressions. The functions below also store lazily
	// created patterns on this object the first time they need them.
	var re = {
		// characters that escapeForRegExp() prefixes with a backslash
		escForRegExpG: /[.*+?^$|()[\]{\\^$]/g,
		// probe pattern (negative lookahead) used by checkRegexSupport()
		testRe: /<(?!\/?(a|b)>)/g,
		// everything except ASCII letters, digits, "_" and "-"
		nonAlphanumericAndHyphenCharsG: /[^A-Za-z0-9_-]/g,
		// an html comment, plus one newline before it and one after it when both are present
		htmlCommentsG: /(\n)?<!--[\S\s]*?-->\1?/g // replace it with $1
	};
	var locale = {};  // namespace-name arrays filled by saveNsNames(), read by removeElements()
	var $tempDiv = $('<div/>'); // scratch element used in unescapeCharEntities()
	var wgScriptPath; // set by formatUrl() on every call
	var sectionNameUriEncodingAdditionalReplacements; // built on first use by encodeSectionNameForUrl()

	// Decodes html character references (for example "&amp;" or "&#39;") in str.
	// All "<" and ">" characters are escaped first, so that nothing in str is
	// handed to .html() as markup; the decoded text is then read back with .text().
	function unescapeCharEntities(str) {
		// A string pattern would replace only the first occurrence,
		// therefore global regular expressions are used here.
		var withoutMarkup = str.replace(/</g, '&lt;').replace(/>/g, '&gt;');
		return $tempDiv.html(withoutMarkup).text();
	}


	// Builds a site-relative URL for a page.
	//   article - page title, optionally followed by "#section name";
	//             spaces are turned into underscores
	//   noredir - if truthy, an index.php URL with "&redirect=no" is returned
	//   edit    - if truthy (and noredir is falsy), an index.php URL with
	//             "&action=edit" is returned; the section part is dropped
	// Otherwise "/wiki/<title>" is returned, followed by "#<section>" if there is one.
	// The title is always URI-encoded (":" and "/" are kept readable). The section
	// is URI-encoded with "%" replaced by ".".
	function formatUrl(article, noredir, edit) {
		wgScriptPath = mw.config.get('wgScriptPath');
		article = article.replace(/ /g, '_');

		// Split at the first "#", so that a section name may itself contain "#".
		var hashAt = article.indexOf('#');
		var page = (hashAt == -1 ? article : article.slice(0, hashAt));
		var section = (hashAt == -1 ? '' : article.slice(hashAt + 1));

		var title = encodeURIComponent(page)
			.replace(/%3A/g, ':').replace(/%2F/g, '/');
		var fragment = (section
			? '#' + encodeURIComponent(section).replace(/%/g, '.')
			: '');

		if (noredir) // the fragment must come after the query string
			return wgScriptPath + '/index.php?title=' + title + '&redirect=no' + fragment;
		else if (edit)
			return wgScriptPath + '/index.php?title=' + title + '&action=edit';
		else
			return '/wiki/' + title + fragment;
	} // formatUrl


	// Encodes a section name for use after the "#" in a URL: spaces become "_",
	// the result is URI-encoded, a few characters that encodeURIComponent()
	// leaves alone are replaced by ".XX" codes, "%3A" is turned back into ":",
	// and finally every remaining "%" becomes ".".
	function encodeSectionNameForUrl(str) {
		if (!sectionNameUriEncodingAdditionalReplacements) {
			// built once, on the first call
			sectionNameUriEncodingAdditionalReplacements = [
				{re: /~/g,  newVal: '.7E'},
				{re: /!/g,  newVal: '.21'},
				{re: /\*/g, newVal: '.2A'},
				{re: /\(/g, newVal: '.28'},
				{re: /\)/g,  newVal: '.29'},
				{re: /\'/g, newVal: '.27'},
				{re:/%3A/g, newVal: ':'}
			];
		}
		var rules = sectionNameUriEncodingAdditionalReplacements;
		var encoded = encodeURIComponent(str.replace(/ /g, '_'));

		for (var k = 0; k < rules.length; k++) {
			encoded = encoded.replace(rules[k].re, rules[k].newVal);
		}

		return encoded.replace(/%/g, '.');
	} // encodeSectionNameForUrl


	// Encodes a section name for use as an element id.
	// Dots in the input are replaced with "_46" before the name is passed through
	// encodeSectionNameForUrl(); afterwards every ":" becomes "_3A" and any
	// character other than ASCII letters, digits, "_" and "-" becomes "_".
	function encodeSectionNameForId(str) {
		return encodeSectionNameForUrl(str.replace(/\./g, '_46'))
			.replace(/:/g, '_3A') // global, so that all colons are encoded alike
			.replace(re.nonAlphanumericAndHyphenCharsG, '_');
	} // encodeSectionNameForId


	// Returns str with a backslash in front of every character matched by
	// re.escForRegExpG, for embedding literal text in a RegExp source string.
	function escapeForRegExp(str) {
		return str.replace(re.escForRegExpG, function (specialChar) {
			return '\\' + specialChar;
		});
	} // escapeForRegExp


	// Builds the regular expressions removeEls() uses to strip an element
	// delimited by startTag and endTag. Returns {pretreat, main}:
	//   main     - matches an innermost startTag...endTag pair, together with one
	//              newline before it and one after it when both are present
	//   pretreat - null unless startTagOfEmbededEl is given; otherwise matches
	//              an embedded element (one that shares the closing tag) inside
	//              the outer element, keeping the text before it in group 1
	// In startTag the placeholders "<<<", "@@@" and ">>>" stand for "(", "|" and ")".
	function removeElRegExp(startTag, endTag, startTagOfEmbededEl) {
		var anyChar = '[\\S\\s]';
		var open = escapeForRegExp(startTag)
			.replace(/<<</g, '(').replace(/@@@/g, '|').replace(/>>>/g, ')');
		var close = escapeForRegExp(endTag);
		var pretreat = null;

		if (startTagOfEmbededEl) {
			var inner = escapeForRegExp(startTagOfEmbededEl);
			pretreat = new RegExp([
				'(', open, '(?:(?!', close, ')', anyChar, ')*?)',
				inner, '(?:(?!', inner, ')', anyChar, ')*?',
				close
			].join(''), 'gi');
		}

		var main = new RegExp([
			'(\\n)?', open,
			'((?!', open, '|', close, ')', anyChar, ')*',
			close, '\\1?'
		].join(''), 'gi');

		return {pretreat: pretreat, main: main};
	} // removeElRegExp


	// Like removeElRegExp(), but the start tag is startTagPre, followed by any
	// one of the strings in startTagArr, followed by startTagPost.
	// startTagOfEmbededEl - needed because, for example, files and wiki links have
	// the same closing tags, so to remove files, pass '[[File:' as the start tag
	// and '[[' as startTagOfEmbededEl.
	function removeElRegExpStartArr(startTagPre, startTagArr, startTagPost,
									endTag, startTagOfEmbededEl) {
		// removeElRegExp() turns these placeholders into a regex alternation
		var alternatives = ['<<<', startTagArr.join('@@@'), '>>>'].join('');
		return removeElRegExp(startTagPre + alternatives + startTagPost,
			endTag, startTagOfEmbededEl);
	} // removeElRegExpStartArr


	// Removes from data everything matched by the expressions in res
	// (an object as returned by removeElRegExp()). Each expression is applied
	// again and again, replacing matches with '$1', until a pass changes nothing
	// or iterationLimit passes (default 1000) have been made. res.pretreat, when
	// present, is fully applied before res.main.
	function removeEls(data, res, iterationLimit) {
		var maxPasses = iterationLimit || 1000;

		function stripRepeatedly(text, pattern) {
			var passesLeft = maxPasses; // guards against an endless loop
			var previous;
			do {
				passesLeft--;
				previous = text;
				text = text.replace(pattern, '$1');
			} while (text != previous && passesLeft > 0);
			return text;
		}

		if (res.pretreat) {
			data = stripRepeatedly(data, res.pretreat);
		}
		return stripRepeatedly(data, res.main);
	} // removeEls


	// Stores every known name of some namespaces in locale: names whose id is
	// -1 go to locale.specialNsArr, names whose id is 6 or -2 go to
	// locale.fileNsArr, and names whose id is 14 go to locale.categoryNsArr.
	// The names and ids are read from the 'wgNamespaceIds' config value.
	function saveNsNames() {
		// adds name to list unless it is already there
		function remember(list, name) {
			if ($.inArray(name, list) == -1)
				list.push(name);
		}

		locale.specialNsArr = [];
		locale.fileNsArr = [];
		locale.categoryNsArr = [];

		$.each(mw.config.get('wgNamespaceIds'), function (nsName, nsId) {
			if (nsId == '-1') // 'special'
				remember(locale.specialNsArr, nsName);
			else if (nsId == '6' || nsId == '-2') // 'file'/'image' or 'media'
				remember(locale.fileNsArr, nsName);
			else if (nsId == '14') // 'category'
				remember(locale.categoryNsArr, nsName);
		});
	} // saveNsNames


	// Replaces nowiki markup in data with html character references.
	// Won't work in all cases.
	//  1. A self-closing or empty nowiki tag between two characters is dropped
	//     and one of the neighbouring characters is escaped instead.
	//  2. <nowiki>...</nowiki> and <pre>...</pre> elements are replaced by
	//     their contents, with the wiki-markup characters listed in
	//     nowikiCharTranslMap escaped.
	// See en.wikipedia.org/wiki/Help:Nowiki#WP:NOWIKI
	function escCharsForNowikiTags(data) {
		var nowikiCharTranslMap = {
			'[': '&#91;', ']': '&#93;', '{': '&#123;', '}': '&#125;',
			'<': '&lt;', '>': '&gt;', ':': '&#58;', '*': '&#42;', '#': '&#35;'
		};

		// NOTE(review): the pattern has no "<" in front of "nowiki", so for a real
		// "<nowiki/>" tag group 1 is that "<" and the first branch below is taken.
		// Confirm whether "(.|^)<(?:nowiki..." was intended.
		var singleCharEscReG = re.singleCharEscG
			|| (re.singleCharEscG = /(.|^)(?:nowiki ?\/|nowiki><\/nowiki)>(.)/g);
		data = data.replace(singleCharEscReG, function (m, $1, $2) {
			if ($1 == '<') return '&lt;' + $2;
			else if (nowikiCharTranslMap[$2]) return $1 + nowikiCharTranslMap[$2];
			else if (nowikiCharTranslMap[$1]) return nowikiCharTranslMap[$1] + $2;
			// Nothing to escape: keep the text as it is. (Returning nothing
			// would insert the string "undefined".)
			return m;
		});

		var noWikiElReG = re.noWikiElG || (re.noWikiElG = /<(nowiki|pre)>([\S\s]*?)<\/\1>/g);
		var noWikiReplaceCharsReG = re.noWikiReplG || (re.noWikiReplG = /\[|]|\{|}|<|>|:|\*|#/g);
		data = data.replace(noWikiElReG, function (match, tagName, contents) {
			return contents.replace(noWikiReplaceCharsReG, function (specialChar) {
				// look up the matched character itself, not the whole contents
				return nowikiCharTranslMap[specialChar];
			});
		});

		return data;
	} // escCharsForNowikiTags


	// Removes whole classes of elements from wiki code.
	//   data  - the wiki code
	//   elStr - comma+space-separated list of what to remove; recognized names:
	//           'comments', 'tables', 'templates', 'references', 'files',
	//           'categories', 'bold/italic', 'behavior switches', 'others'
	// The removals always run in the order used below, whatever the order in elStr.
	// Regular expressions are created on first use and kept in re.
	function removeElements(data, elStr) {
		var requested = elStr.split(', ');

		function wants(kind) {
			return $.inArray(kind, requested) > -1;
		}

		if (wants('comments')) {
			data = data.replace(re.htmlCommentsG, '$1');
		}

		if (wants('tables')) {
			if (!re.wikiTable)
				re.wikiTable = removeElRegExp('{|', '|}');
			data = removeEls(data, re.wikiTable);

			if (!re.htmlTable)
				re.htmlTable = removeElRegExp('<table', '</table>');
			data = removeEls(data, re.htmlTable);
		}

		if (wants('templates')) {
			if (!re.templates)
				re.templates = removeElRegExp('{{', '}}');
			data = removeEls(data, re.templates);
		}

		if (wants('references')) {
			if (!re.refs)
				re.refs = /<ref[^>]*?(\/>|>[\S\s]*?<\/ref\s*>)/ig;
			data = data.replace(re.refs, '');
		}

		if (wants('files')) {
			if (!locale.fileNsArr)
				saveNsNames();
			if (!re.files)
				re.files = removeElRegExpStartArr('[[', locale.fileNsArr, ':', ']]', '[[');
			data = removeEls(data, re.files);

			if (!re.gallery)
				re.gallery = /(\n)?<gallery[^>]*>[\S\s]*?<\/gallery>\1?/gi;
			data = data.replace(re.gallery, '$1');
		}

		if (wants('categories')) {
			if (!locale.categoryNsArr)
				saveNsNames();
			if (!re.category)
				re.category = removeElRegExpStartArr('[[', locale.categoryNsArr, ':', ']]');
			data = removeEls(data, re.category);
		}

		if (wants('bold/italic')) {
			if (!re.boldItalicG)
				re.boldItalicG = /<\/?(i|b|strong|em)>|'''?|(&#39;){2,3}/gi;
			data = data.replace(re.boldItalicG, '');
		}

		if (wants('behavior switches')) {
			if (!re.behaviorSwitchesG)
				re.behaviorSwitchesG = /(\n)?__[^\s]+?__\1?/g;
			data = data.replace(re.behaviorSwitchesG, '$1');
		}

		if (wants('others')) {
			if (!re.timelineG)
				re.timelineG = /(\n)?<timeline>[\S\s]*?<\/timeline>\1?/gi;
			data = data.replace(re.timelineG, '$1');
		}

		return data;
	} // removeElements;


	// Replaces every wikilink in data with its displayed text:
	// [[target|label]] becomes label and [[target]] becomes target.
	// All files ([[File:...]]) must be removed BEFORE calling this function.
	function unlink(data) {
		var prev, cntr = 1000;
		// The pattern is cached under the same key that is read here.
		var remAddrReG = re.remAddrG || (re.remAddrG = /\[\[[^|\]]*\|/g);
		var unlinkLinksReG = re.unlinkLinksReG || (re.unlinkLinksReG = /\[\[([^\]\[]+)\]\]/g);
		do {
			cntr--; // guards against an endless loop
			prev = data;
			// remove addresses from all links; repeated because each pass
			// removes only the part before the first pipe of a link:
			data = data.replace(remAddrReG, '[[');
		} while (data != prev && cntr > 0);

		// unlink all links:
		data = data.replace(unlinkLinksReG, '$1');
		return data;
	} // unlink


	// Converts wiki bold ('''text''') and italic (''text'') markup to <b> and <i>
	// elements. Markup that is not closed on its line ends at the line break;
	// the line break itself is kept. Runs of four, six, or more apostrophes
	// are removed; the characters next to such a run are kept.
	function boldAndItalicToHtml(data) {
		if (!re.boldAndItalicToHtml1) {
			// the first regex removes four, six, or more apostrophes;
			// the character before the run is captured and put back, the one
			// after it is only looked at, so adjacent runs are all found
			re.boldAndItalicToHtml1 = /(^|[^'])''''('{2,})?(?=[^']|$)/g;
			// a lookahead is used for the line break so that it is not consumed
			re.boldAndItalicToHtml2 = /'''([^'\n][^\n]*?)(?:'''|(?=\n))/g;
			re.boldAndItalicToHtml3 = /''([^\n]+?)(?:''|(?=\n))/g;
		}

		return data.replace(re.boldAndItalicToHtml1, '$1')
			.replace(re.boldAndItalicToHtml2, '<b>$1</b>')
			.replace(re.boldAndItalicToHtml3, '<i>$1</i>');
	} // boldAndItalicToHtml


	// Returns the text before the first section heading. A heading is a line that
	// starts with one or more "=", has some text, and ends with the same number
	// of "=" (optionally followed by spaces). The heading may be on the very
	// first line, in which case an empty string is returned.
	// If there are no headings, the whole text is returned; the categories are
	// removed from it first if removeCategories is truthy.
	function beforeTheFirstSection(data, removeCategories) {
		var beforeFirstSectRe = re.beforeFirstSect
			|| (re.beforeFirstSect = /^([\S\s]*?)(?=(?:^|\n)(=+).+?\2[^\S\n]*(?:\n|$))/);
		var found = beforeFirstSectRe.exec(data);
		// Test for the match itself and not for the captured text: an empty lead
		// is a valid result and must not be taken for "no sections".
		if (found)
			return found[1];
		return (removeCategories ? removeElements(data, 'categories') : data);
	} // beforeTheFirstSection


	// Splits wiki code into sections. Returns an array of objects:
	//   eq       - the "=" characters of the heading ('' for the lead)
	//   level    - the number of "=" characters (0 for the lead)
	//   heading  - the trimmed heading text ('' for the lead)
	//   contents - the trimmed text of the section, up to the next heading of
	//              any level (for the lead: the result of beforeTheFirstSection())
	// At most 1000 headings are processed.
	function divideSections(data) {
		var sections = [{
			eq: '',
			level: 0,
			heading: '',
			contents: beforeTheFirstSection(data, false)
		}];
		// A heading may be followed by a line break or by the end of the text,
		// which is the same condition that ends the contents of a section.
		var regex = re.divSectionsG ||
			(re.divSectionsG = /(^|\n)(=+)(.+?)\2[^\S\n]*(?=\n|$)([\S\s]*?)(?=\n(=+).+?\5[^\S\n]*(?:\n|$)|$)/g);
		var match;
		var cntr = 1000;

		// The regex is shared between calls and exec() keeps its position in
		// lastIndex, so always start from the beginning of the text.
		regex.lastIndex = 0;
		while (cntr > 0 && (match = regex.exec(data))) {
			cntr--;
			sections.push({
				eq: match[2],
				level: match[2].length,
				heading: $.trim(match[3]),
				contents: $.trim(match[4])
			});
		}
		regex.lastIndex = 0;

		return sections;
	} // divideSections


	// Returns true if replacing with re.testRe (a pattern with a negative
	// lookahead) gives the expected result on a fixed sample string.
	function checkRegexSupport() {
		var sample = '<a><bd</e></b>';
		var expected = '<a>&lt;bd&lt;/e></b>';
		return sample.replace(re.testRe, '&lt;') === expected;
	}


	// Removes html comments and html tags from data, except for the tags in
	// the comma+space-separated whiteListTagsStr list.
	// Removes all the attributes from the white-listed tags.
	// Unless leaveSpecialChars is truthy, the remaining "<" and ">" characters
	// (those that are not part of a white-listed tag), the "&" characters that
	// do not start a character reference, and the characters " ' ` are then
	// replaced with character references.
	// Throws 1 if checkRegexSupport() fails; throws 2 if one of the two loops
	// below is still changing the text after 1000 passes.
	function sanitizeHtml(data, whiteListTagsStr, leaveSpecialChars) {

		if (!checkRegexSupport())
			throw 1; // no (lookahead) regex support

		// 'a, b' becomes the regex alternation 'a|b'
		var whiteList = (whiteListTagsStr || '').split(', ').join('|');
		// NOTE(review): commentReG is not referenced again in this function;
		// the loop below uses re.htmlCommentsG.
		var commentReG = re.htmlCommentG || (re.htmlCommentG = /<!--[\S\s]*?-->/g);
		var nonWhiteListedTagsReG, allTagsG;
		var lessThanNotBeforeWLTagG;
		var grThanNotAndAfterWLTagG;
		var tagAttributesReG;
		var oldData, cntr;

		if (whiteList !== '') {
			// the whitelist-dependent expressions are cached per whiteListTagsStr value
			var byAll = re.resByWhitelist = (re.resByWhitelist || {});
			var by = byAll[whiteListTagsStr] || (byAll[whiteListTagsStr] = {});

			// any tag whose name is not in the white list
			nonWhiteListedTagsReG = by.nonWhiteListedTagsG
				|| (by.nonWhiteListedTagsG = new RegExp('<(?!/?(' + whiteList + ')(\\b|/))[^>]*>', 'gi'));
			// a "<" that does not start an attribute-less white-listed tag
			lessThanNotBeforeWLTagG = by.lessThanNotBeforeWLTagG
				|| (by.lessThanNotBeforeWLTagG = new RegExp('<(?!/?(' + whiteList + ')/?>)', 'gi'));
			// a ">", with the white-listed tag it closes (if any) in group 1
			grThanNotAndAfterWLTagG = by.grThanNotAndAfterWLTagG
				|| (by.grThanNotAndAfterWLTagG = new RegExp('(</?(' + whiteList + ')/?)?>', 'gi'));
			// a tag: group 1 is the name (with a leading "/" for end tags),
			// group 2 is the "/" of a self-closing tag
			tagAttributesReG = re.tagAttributesG
				|| (re.tagAttributesG = /<(\/?[a-z][a-z0-9]*)[^>]*?(\/)?>/gi);
		}
		else
			allTagsG = re.allTagsG || (re.allTagsG = /<(\b|\/)[^>]*>/g);

		// repeat the removals until a pass changes nothing
		cntr = 1000;
		do {
			oldData = data;
			cntr--;
			// remove comments:
			data = data.replace(re.htmlCommentsG, '$1');
			// remove all tags except the white-listed ones
			if (whiteList !== '') {
				data = data.replace(nonWhiteListedTagsReG, '');													
				// remove all attributes from the remaining tags:
				data = data.replace(tagAttributesReG, '<$1$2>');
			}
			else
				data = data.replace(allTagsG, '');
		} while (oldData != data && cntr > 0);
		if (cntr <= 0) throw 2;
		if (!leaveSpecialChars) {
			var ampNotInCharRefReG = re.ampReG || (re.ampReG = /&(?!#?[xX]?[a-zA-Z0-9]+;)/g);
			var ltReG = /</g;
			var gtReG = />/g;
			var quoteReG = /"/g;
			var aposReG = /'/g;
			var graveReG = /`/g;
			// repeat the escaping until a pass changes nothing
			cntr = 1000;
			do {
				oldData = data;
				cntr--;
				if (whiteList !== '') {
					// html-escape all < and > except if part of a whitelisted tag
					data = data.replace(lessThanNotBeforeWLTagG, '&lt;');
					data = data.replace(grThanNotAndAfterWLTagG, function ($0, $1) {
						// $1 is set only when the ">" closes a white-listed tag
						return $1 ? $0 : '&gt;';
					});
				}
				else { // html-escape all < and > chars
					data = data.replace(ltReG, '&lt;').replace(gtReG, '&gt;');
				}
				// escape & to &amp; if obviously not a part of a char ref:
				data = data.replace(ampNotInCharRefReG, '&amp;');
				// escape all quotes (` is used in old IE)
				data = data.replace(quoteReG, '&quot;').replace(aposReG, '&#39;')
					.replace(graveReG, '&#96;');
			} while (oldData != data && cntr > 0);
			if (cntr <= 0) throw 2;
		}

		return data;
	} // sanitizeHtml


	// Looks for a segment of one of the given kinds at the cursor or selection.
	//   bsa          - see focusedCustomSegment()
	//   segmentNames - array of names, or a comma+space-separated string of names
	// Only 'wikilink' is handled: if it is among the names, the result of
	// focusedCustomSegment() for [[...]] is returned. Otherwise returns undefined.
	function focusedSegment(bsa, segmentNames) {
		var names = segmentNames;
		if (typeof names != 'object')
			names = names.split(', ');

		var idx = 0;
		while (idx < names.length) {
			if (names[idx] == 'wikilink')
				return focusedCustomSegment(bsa, '[[', ']]', '', '[]<>{}');
			idx++;
		}
	}


	// Finds the segment (for example a wikilink) that the selection or the
	// cursor is in, and returns [text_before_segment, segment, text_after_segment],
	// or undefined if no such segment is found.
	// bsa - an array with 3 elements: [text_before_the_selection/cursor, selection, text_after]
	// startChars, endChars - the char(s) indicating the start/end of the segment
	// otherStartChars (optional) - start chars of other segments with the same endChars,
	//    needed only for some elements, for example if startChars is [[File:,
	//    otherStartChars needs to be [[ because links can be embedded in file elements.
	// invalidBeforePipe (optional) - a string with individual illegal characters. Illegal
	//    only if before the first pipe character "|" (or anywhere, if there is no pipe
	//    character).
	function focusedCustomSegment(bsa, startChars, endChars, otherStartChars, invalidBeforePipe) {

		function endMatches(str, endChars) {
			return (str.slice(-endChars.length) === endChars);
		}

		function startMatches(str, startChars) {
			return (str.slice(0, startChars.length) === startChars);
		}

		var before = bsa[0];
		var selection = bsa[1]; // the selection
		var after = bsa[2];
		var spaces;
		var i;

		if (!startChars || !endChars)
			return;

		if (selection) { // there is some selected text
			spaces = selection.match(/^\s+/);
			if (spaces) { // spaces at the beginning of the selected text

				if (endMatches(before, startChars)) {
					selection = startChars + selection;
					before = before.slice(0, -startChars.length);
				}
				else {
					// move the spaces to the end of the text-before-the-selection:
					before += spaces[0];
					selection = selection.slice(spaces[0].length);
					// check for startChars at beginning of selection:
					if (!startMatches(selection, startChars))
						return;
				}
			}
			else {
				// while no (complete) startChars string at beginning of selection:
				// move a char from the end of textBefore to the beginning of selection
				var startCharsFound = false;
				for (i = 0; i <= startChars.length; i++) {
					if (startMatches(selection, startChars)) {
						startCharsFound = true;
						break;
					}
					if (before.length == 0)
						break;
					selection = before.slice(before.length - 1) + selection;
					before = before.slice(0, before.length - 1);
				}
				if (!startCharsFound)
					return;

				// TODO: check if selection contains only one outer element,
				//        and the start-end chars are balanced
			}

			spaces = selection.match(/\s+$/);
			if (spaces) { // spaces at the end of the selected text
				if (startMatches(after, endChars)) {
					selection = selection + endChars;
					after = after.slice(endChars.length);
				}
				else {
					// move the spaces to the beginning of the text-after-the-selection:
					after = spaces[0] + after;
					selection = selection.slice(0, -spaces[0].length);
					if (!endMatches(selection, endChars))
						return;
				}
			}
			else {
				// while no (complete) endChars string found at end of selection:
				// move a char from the beginning of textAfter to the end of selection
				var endCharsFound = false;
				for (i = 0; i <= endChars.length; i++) {
					if (endMatches(selection, endChars)) {
						endCharsFound = true;
						break;
					}
					if (after.length == 0)
						break;
					selection = selection + after.charAt(0);
					after = after.slice(1);
				}
				if (!endCharsFound)
					return;
			}
		} // if (selection)
		else { // no text selected
			var text = before + after;
			// TODO: add a loop to allow the cursor to be after an embedded element
			// The segment must start before the cursor, that is at least
			// the first of the start chars has to be in textBefore.
			var startCharsAt = text.lastIndexOf(startChars, before.length - 1);
			if (startCharsAt == -1)
				return;
			var closing = startCharsAt;
			var opening = startCharsAt;
			var openingOther;
			var segmentFound = false;
			i = 0;
			// Each pass takes the next end marker and the next start marker (of
			// either kind). The segment ends at the first end marker that comes
			// before the next start marker. At most 10 passes are made.
			while (i++ < 10) {
				closing = text.indexOf(endChars, closing + 1);
				if (closing == -1) {
					return;
				}
				if (otherStartChars) {
					// search after the current position, otherwise the start
					// marker found last would be found again
					openingOther = text.indexOf(otherStartChars, opening + 1);
				}
				opening = text.indexOf(startChars, opening + 1);
				if (opening == -1)
					opening = text.length;
				if (otherStartChars) {
					if (openingOther > -1)
						opening = (openingOther < opening ? openingOther : opening);
				}
				if (closing < opening) {
					if (closing < before.length - endChars.length) {
						return; // the segment ends before the cursor
					}
					// the segment includes its end marker
					selection = text.slice(startCharsAt, closing + endChars.length);
					before = text.slice(0, startCharsAt);
					after = text.slice(closing + endChars.length);
					segmentFound = true;
					break;
				}

			}
			if (!segmentFound)
				return; // nesting too deep for the loop above
		}

		if (invalidBeforePipe) {
			var invalidEscForRe = escapeForRegExp(invalidBeforePipe);
			var beforePipe = selection.slice(startChars.length, -endChars.length).match(/[^|]*/)[0];
			if (beforePipe.match('[' + invalidEscForRe + ']'))
				return;
		}

		return [before, selection, after];
	} // focusedCustomSegment


	// The public interface of the library: the version number and the
	// functions defined above, exported under their own names.
	return {
		version: version,
		unescapeCharEntities: unescapeCharEntities,
		formatUrl: formatUrl,
		encodeSectionNameForUrl: encodeSectionNameForUrl,
		encodeSectionNameForId: encodeSectionNameForId,
		checkRegexSupport: checkRegexSupport,
		escCharsForNowikiTags: escCharsForNowikiTags,
		removeElRegExp: removeElRegExp,
		removeElRegExpStartArr: removeElRegExpStartArr,
		removeElements: removeElements,
		unlink: unlink,
		sanitizeHtml: sanitizeHtml,
		boldAndItalicToHtml: boldAndItalicToHtml,
		beforeTheFirstSection: beforeTheFirstSection,
		divideSections: divideSections,
		focusedCustomSegment: focusedCustomSegment, // incomplete implementation
		focusedSegment: focusedSegment // works only for wikilinks right now
	};
})();