|
|
'use strict' |
|
|
|
|
|
var legacy = require('character-entities-legacy') |
|
|
var invalid = require('character-reference-invalid') |
|
|
var decimal = require('is-decimal') |
|
|
var hexadecimal = require('is-hexadecimal') |
|
|
var alphanumerical = require('is-alphanumerical') |
|
|
var decodeEntity = require('./decode-entity') |
|
|
|
|
|
// Expose the option-normalising wrapper as this module's only export.
module.exports = parseEntities
|
|
|
|
|
// Cached built-ins shared by the functions below.

// `hasOwnProperty`, borrowed from a plain object so it can be `call`ed on any
// map without trusting that map's own properties.
var own = {}.hasOwnProperty

// Turns a UTF-16 code unit into a one-character string.
var fromCharCode = String.fromCharCode

// A callable that accepts anything and returns `undefined`.
// Used in place of the warning reporter when no warning handler is given.
var noop = Function.prototype
|
|
|
|
|
// Default settings.
// `parseEntities` copies every key listed here into the settings it hands to
// `parse`; an option the caller leaves `null` or `undefined` falls back to the
// value below.
var defaults = {
  // Handler called for each warning: `(message, position, code)`.
  warning: null,
  // Handler called for each decoded reference: `(decoded, location, source)`.
  reference: null,
  // Handler called for each run of plain text: `(text, location)`.
  text: null,
  // Values used as `this` when calling the matching handler.
  warningContext: null,
  referenceContext: null,
  textContext: null,
  // Where `value` starts: either a point (`line`, `column`, `offset`), or an
  // object with a `start` point and an `indent` list.
  position: {},
  // Extra character that, directly after `&`, prevents a reference.
  additional: null,
  // Whether unterminated named references get the attribute treatment (left
  // as text when followed by `=` or an alphanumerical).
  attribute: false,
  // Whether references without a closing semicolon are decoded.
  nonTerminated: true
}
|
|
|
|
|
// Characters.
// UTF-16 code units, compared against the results of `charCodeAt`.
var tab = 9 // '\t'
var lineFeed = 10 // '\n'
var formFeed = 12 // '\f'
var space = 32 // ' '
var ampersand = 38 // '&'
var semicolon = 59 // ';'
var lessThan = 60 // '<'
var equalsTo = 61 // '='
var numberSign = 35 // '#'
var uppercaseX = 88 // 'X'
var lowercaseX = 120 // 'x'
var replacementCharacter = 65533 // U+FFFD REPLACEMENT CHARACTER
|
|
|
|
|
// Reference types.
// The three kinds of character reference the parser tells apart: `&name;`,
// `&#x1F;`, and `&#31;`.
var name = 'named'
var hexa = 'hexadecimal'
var deci = 'decimal'

// Map of bases.
// Radix handed to `parseInt` for each numeric reference type.
var bases = {}

bases[hexa] = 16
bases[deci] = 10
|
|
|
|
|
// Map of types to tests.
// Each type of character reference accepts different characters.
// The test for the current type is applied to every code unit after the
// reference's prefix; the first unit it rejects marks where the reference
// ends (as the semicolon is not strictly needed).
var tests = {}

tests[name] = alphanumerical
tests[deci] = decimal
tests[hexa] = hexadecimal
|
|
|
|
|
// Warning types.
// Numeric codes passed to the warning handler as its third argument.
var namedNotTerminated = 1
var numericNotTerminated = 2
var namedEmpty = 3
var numericEmpty = 4
var namedUnknown = 5
var numericDisallowed = 6
var numericProhibited = 7

// Warning messages.
// Human-readable reason for each warning code, passed to the warning handler
// as its first argument.
var messages = {}

messages[namedNotTerminated] =
  'Named character references must be terminated by a semicolon'
messages[numericNotTerminated] =
  'Numeric character references must be terminated by a semicolon'
messages[namedEmpty] = 'Named character references cannot be empty'
messages[numericEmpty] = 'Numeric character references cannot be empty'
messages[namedUnknown] = 'Named character references must be known'
messages[numericDisallowed] =
  'Numeric character references cannot be disallowed'
messages[numericProhibited] =
  'Numeric character references cannot be outside the permissible Unicode range'
|
|
|
|
|
// Wrap to ensure clean parameters are given to `parse`.
//
// Builds a fresh settings object holding exactly the keys of `defaults`,
// taking each value from `options` unless it is `null` or `undefined`.
// `options` itself is never modified.
//
// `options.position` may be a point (`line`, `column`, `offset`) or an object
// with a `start` point and/or an `indent` list.
// In the latter case it is unpacked into `settings.position` (the start
// point) and `settings.indent`.
//
// Returns whatever `parse` returns for `value` and those settings.
function parseEntities(value, options) {
  var settings = {}
  var position
  var option
  var key

  if (!options) {
    options = {}
  }

  for (key in defaults) {
    option = options[key]
    settings[key] =
      option === null || option === undefined ? defaults[key] : option
  }

  position = settings.position

  if (position.indent || position.start) {
    settings.indent = position.indent || []
    // A position carrying only `indent` has no start point; fall back to an
    // empty point so `parse` can still read `line`, `column`, and `offset`
    // from it instead of throwing on `undefined`.
    settings.position = position.start || {}
  }

  return parse(value, settings)
}
|
|
|
|
|
// Parse entities.
//
// Walks `value` one UTF-16 code unit at a time, decodes named, decimal, and
// hexadecimal character references, and returns the decoded string.
//
// `settings` is expected to hold the keys of `defaults`, plus an optional
// `indent` list:
//
// *   `text` — optional handler called as `(text, {start, end})` with
//     `textContext` as `this` for every run of plain text
// *   `reference` — optional handler called as
//     `(decoded, {start, end}, source)` with `referenceContext` as `this` for
//     every decoded reference
// *   `warning` — optional handler called as `(message, position, code)` with
//     `warningContext` as `this`
// *   `position` — point (`line`, `column`, `offset`) at which `value` starts;
//     missing fields default to line 1, column 1, offset 0
// *   `indent` — columns at which the lines following each line feed start
// *   `additional` — extra character (string or code) that, directly after
//     `&`, prevents a reference
// *   `attribute` — when on, an unterminated named reference followed by `=`
//     or an alphanumerical is left as text
// *   `nonTerminated` — when off, references without a closing semicolon are
//     left as text
// eslint-disable-next-line complexity
function parse(value, settings) {
  var additional = settings.additional
  var nonTerminated = settings.nonTerminated
  var handleText = settings.text
  var handleReference = settings.reference
  var handleWarning = settings.warning
  var textContext = settings.textContext
  var referenceContext = settings.referenceContext
  var warningContext = settings.warningContext
  var pos = settings.position
  var indent = settings.indent || []
  var length = value.length
  var index = 0
  var lines = -1
  var column = pos.column || 1
  var line = pos.line || 1
  var queue = ''
  var result = []
  var entityCharacters
  var namedEntity
  var terminated
  var characters
  var character
  var reference
  var following
  var warning
  var reason
  var output
  var entity
  var begin
  var start
  var type
  var test
  var prev
  var next
  var diff
  var end

  if (typeof additional === 'string') {
    additional = additional.charCodeAt(0)
  }

  // Cache the current point.
  prev = now()

  // Wrap `handleWarning`.
  warning = handleWarning ? parseError : noop

  // Ensure the algorithm walks over the first character and the end
  // (inclusive).
  // The extra step past the end reads `NaN` from `charCodeAt`, which is what
  // triggers the final `flush`.
  index--
  length++

  while (++index < length) {
    // If the previous character was a newline.
    if (character === lineFeed) {
      column = indent[lines] || 1
    }

    character = value.charCodeAt(index)

    if (character === ampersand) {
      following = value.charCodeAt(index + 1)

      // The behaviour depends on the identity of the next character.
      if (
        following === tab ||
        following === lineFeed ||
        following === formFeed ||
        following === space ||
        following === ampersand ||
        following === lessThan ||
        // `NaN`: the ampersand is the last character.
        following !== following ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++

        continue
      }

      // `start` is the index right after `&`, `begin` the index of the first
      // character of the name or number, and `end` the scanning cursor.
      start = index + 1
      begin = start
      end = start

      if (following === numberSign) {
        // Numerical entity.
        end = ++begin

        // The behaviour further depends on the next character.
        following = value.charCodeAt(end)

        if (following === uppercaseX || following === lowercaseX) {
          // ASCII hex digits.
          type = hexa
          end = ++begin
        } else {
          // ASCII digits.
          type = deci
        }
      } else {
        // Named entity.
        type = name
      }

      entityCharacters = ''
      entity = ''
      characters = ''
      // Reset per candidate: two of the branches below deliberately decode
      // nothing, and must not see the previous reference's result.
      reference = ''
      test = tests[type]
      end--

      while (++end < length) {
        following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === name && own.call(legacy, characters)) {
          entityCharacters = characters
          entity = legacy[characters]
        }
      }

      terminated = value.charCodeAt(end) === semicolon

      if (terminated) {
        end++

        namedEntity = type === name ? decodeEntity(characters) : false

        if (namedEntity) {
          entityCharacters = characters
          entity = namedEntity
        }
      }

      diff = 1 + end - start

      if (!terminated && !nonTerminated) {
        // Empty: non-terminated references are turned off, so nothing is
        // decoded and no warning is given.
      } else if (!characters) {
        // An empty (possible) entity is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== name) {
          warning(numericEmpty, diff)
        }
      } else if (type === name) {
        // An ampersand followed by anything unknown, and not terminated, is
        // invalid.
        if (terminated && !entity) {
          warning(namedUnknown, 1)
        } else {
          // If there’s something after an entity name which is not known, cap
          // the reference.
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            reason = entityCharacters ? namedNotTerminated : namedEmpty

            if (settings.attribute) {
              following = value.charCodeAt(end)

              if (following === equalsTo) {
                warning(reason, diff)
                entity = null
              } else if (alphanumerical(following)) {
                entity = null
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = entity
      } else {
        if (!terminated) {
          // All non-terminated numeric entities are not rendered, and trigger a
          // warning.
          warning(numericNotTerminated, diff)
        }

        // When terminated and number, parse as either hexadecimal or decimal.
        reference = parseInt(characters, bases[type])

        // Trigger a warning when the parsed number is prohibited, and replace
        // with replacement character.
        if (prohibited(reference)) {
          warning(numericProhibited, diff)
          reference = fromCharCode(replacementCharacter)
        } else if (reference in invalid) {
          // Trigger a warning when the parsed number is disallowed, and replace
          // by an alternative.
          warning(numericDisallowed, diff)
          reference = invalid[reference]
        } else {
          // Parse the number.
          output = ''

          // Trigger a warning when the parsed number should not be used.
          if (disallowed(reference)) {
            warning(numericDisallowed, diff)
          }

          // Stringify the number.
          // Code points above U+FFFF are encoded as a surrogate pair: the
          // high ten bits go into the first unit, the low ten into the second.
          if (reference > 0xffff) {
            reference -= 0x10000
            output += fromCharCode(((reference >>> 10) & 0x3ff) | 0xd800)
            reference = 0xdc00 | (reference & 0x3ff)
          }

          reference = output + fromCharCode(reference)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat an entity.
      if (reference) {
        flush()

        prev = now()
        // The loop’s `++index` then lands on the first character after the
        // reference.
        index = end - 1
        column += end - start + 1
        result.push(reference)
        next = now()
        // `index` sits on the reference’s last character; the end point lies
        // one past it.
        next.offset++

        if (handleReference) {
          handleReference.call(
            referenceContext,
            reference,
            {start: prev, end: next},
            value.slice(start - 1, end)
          )
        }

        prev = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === lineFeed) {
        line++
        lines++
        column = 0
      }

      // `NaN` (one step past the end of `value`) is the only value that
      // differs from itself: it marks EOF.
      if (character === character) {
        queue += fromCharCode(character)
        column++
      } else {
        flush()
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  // Get current position.
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    }
  }

  // “Throw” a parse-error: a warning.
  // `offset` shifts the reported point relative to the current one.
  function parseError(code, offset) {
    var position = now()

    position.column += offset
    position.offset += offset

    handleWarning.call(warningContext, messages[code], position, code)
  }

  // Flush `queue` (normal text).
  // Macro invoked before each entity and at the end of `value`.
  // Does nothing when `queue` is empty.
  function flush() {
    if (queue) {
      result.push(queue)

      if (handleText) {
        handleText.call(textContext, queue, {start: prev, end: now()})
      }

      queue = ''
    }
  }
}
|
|
|
|
|
// Check if `code` is outside the permissible Unicode range: a surrogate
// (U+D800 through U+DFFF) or anything above U+10FFFF.
function prohibited(code) {
  return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
}
|
|
|
|
|
// Check if `code` is disallowed: a control character other than tab, line
// feed, or form feed (U+0001 through U+0008, U+000B, U+000D through U+001F,
// U+007F through U+009F), or a noncharacter (U+FDD0 through U+FDEF, and any
// code whose low sixteen bits are FFFE or FFFF).
function disallowed(code) {
  return (
    (code >= 0x0001 && code <= 0x0008) ||
    code === 0x000b ||
    (code >= 0x000d && code <= 0x001f) ||
    (code >= 0x007f && code <= 0x009f) ||
    (code >= 0xfdd0 && code <= 0xfdef) ||
    (code & 0xffff) === 0xffff ||
    (code & 0xffff) === 0xfffe
  )
}
|
|
|
|