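// Residue from the repository web view this file was copied from
// (not part of the module):
//   You can not select more than 25 topics
//   Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//   970 lines
//   28 KiB
//   970 lines
//   28 KiB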
module.exports = Tokenizer; |
|
|
|
var decodeCodePoint = require("entities/lib/decode_codepoint.js"); |
|
var entityMap = require("entities/maps/entities.json"); |
|
var legacyMap = require("entities/maps/legacy.json"); |
|
var xmlMap = require("entities/maps/xml.json"); |
|
|
|
// Tokenizer states. Every constant takes the next value of the running
// counter `i`, so the declaration order below fixes the numeric values.
var i = 0;

var TEXT = i++;
var BEFORE_TAG_NAME = i++; //after <
var IN_TAG_NAME = i++;
var IN_SELF_CLOSING_TAG = i++;
var BEFORE_CLOSING_TAG_NAME = i++;
var IN_CLOSING_TAG_NAME = i++;
var AFTER_CLOSING_TAG_NAME = i++;

//attributes
var BEFORE_ATTRIBUTE_NAME = i++;
var IN_ATTRIBUTE_NAME = i++;
var AFTER_ATTRIBUTE_NAME = i++;
var BEFORE_ATTRIBUTE_VALUE = i++;
var IN_ATTRIBUTE_VALUE_DQ = i++; // "
var IN_ATTRIBUTE_VALUE_SQ = i++; // '
var IN_ATTRIBUTE_VALUE_NQ = i++;

//declarations
var BEFORE_DECLARATION = i++; // !
var IN_DECLARATION = i++;

//processing instructions
var IN_PROCESSING_INSTRUCTION = i++; // ?

//comments
var BEFORE_COMMENT = i++;
var IN_COMMENT = i++;
var AFTER_COMMENT_1 = i++;
var AFTER_COMMENT_2 = i++;

//cdata
var BEFORE_CDATA_1 = i++; // [
var BEFORE_CDATA_2 = i++; // C
var BEFORE_CDATA_3 = i++; // D
var BEFORE_CDATA_4 = i++; // A
var BEFORE_CDATA_5 = i++; // T
var BEFORE_CDATA_6 = i++; // A
var IN_CDATA = i++; // [
var AFTER_CDATA_1 = i++; // ]
var AFTER_CDATA_2 = i++; // ]

//special tags
var BEFORE_SPECIAL = i++; //S
var BEFORE_SPECIAL_END = i++; //S

var BEFORE_SCRIPT_1 = i++; //C
var BEFORE_SCRIPT_2 = i++; //R
var BEFORE_SCRIPT_3 = i++; //I
var BEFORE_SCRIPT_4 = i++; //P
var BEFORE_SCRIPT_5 = i++; //T
var AFTER_SCRIPT_1 = i++; //C
var AFTER_SCRIPT_2 = i++; //R
var AFTER_SCRIPT_3 = i++; //I
var AFTER_SCRIPT_4 = i++; //P
var AFTER_SCRIPT_5 = i++; //T

var BEFORE_STYLE_1 = i++; //T
var BEFORE_STYLE_2 = i++; //Y
var BEFORE_STYLE_3 = i++; //L
var BEFORE_STYLE_4 = i++; //E
var AFTER_STYLE_1 = i++; //T
var AFTER_STYLE_2 = i++; //Y
var AFTER_STYLE_3 = i++; //L
var AFTER_STYLE_4 = i++; //E

//entities
var BEFORE_ENTITY = i++; //&
var BEFORE_NUMERIC_ENTITY = i++; //#
var IN_NAMED_ENTITY = i++;
var IN_NUMERIC_ENTITY = i++;
var IN_HEX_ENTITY = i++; //X
|
|
|
// Identifiers for the "special" tag currently open (stored in `_special`).
// Values are assigned from the running counter `j`.
var j = 0;

var SPECIAL_NONE = j++;
var SPECIAL_SCRIPT = j++;
var SPECIAL_STYLE = j++;
|
|
|
/**
 * Tells whether `c` is one of the whitespace characters the tokenizer
 * recognises: space, line feed, tab, form feed or carriage return.
 *
 * @param {string} c Single character to test.
 * @returns {boolean} True when `c` is whitespace.
 */
function whitespace(c) {
    return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r";
}
|
|
|
/**
 * Builds a state handler that expects one specific character.
 *
 * The returned function must be invoked with a tokenizer as `this`. When the
 * character matches `upper` (or its lower-case form), the tokenizer moves to
 * `SUCCESS`; otherwise it moves to `FAILURE` and `_index` is decremented so
 * the same character is processed again in the new state.
 *
 * @param {string} upper Expected character, given in upper case.
 * @param {number} SUCCESS State to enter on a match.
 * @param {number} FAILURE State to enter on a mismatch.
 * @returns {function(string): void} State handler.
 */
function ifElseState(upper, SUCCESS, FAILURE) {
    var lower = upper.toLowerCase();

    if (upper === lower) {
        // No case distinction (e.g. "#"): a single comparison is enough.
        return function(c) {
            if (c === lower) {
                this._state = SUCCESS;
            } else {
                this._state = FAILURE;
                this._index--;
            }
        };
    } else {
        return function(c) {
            if (c === lower || c === upper) {
                this._state = SUCCESS;
            } else {
                this._state = FAILURE;
                this._index--;
            }
        };
    }
}
|
|
|
/**
 * Builds a state handler used while matching the name of a special tag.
 *
 * The returned function must be invoked with a tokenizer as `this`. A match
 * on `upper` (either case) moves to `NEXT_STATE`; anything else falls back to
 * IN_TAG_NAME and decrements `_index` so the character is processed again.
 *
 * @param {string} upper Expected character, given in upper case.
 * @param {number} NEXT_STATE State to enter on a match.
 * @returns {function(string): void} State handler.
 */
function consumeSpecialNameChar(upper, NEXT_STATE) {
    var lower = upper.toLowerCase();

    return function(c) {
        if (c === lower || c === upper) {
            this._state = NEXT_STATE;
        } else {
            this._state = IN_TAG_NAME;
            this._index--; //consume the token again
        }
    };
}
|
|
|
/**
 * Creates a tokenizer in its initial state (TEXT, empty buffer, running).
 *
 * @param {Object} [options] Optional settings; only `xmlMode` and
 *     `decodeEntities` are read, both coerced to booleans.
 * @param {Object} cbs Callback object; the tokenizer invokes its `on*`
 *     methods as tokens are recognised.
 */
function Tokenizer(options, cbs) {
    this._state = TEXT;
    this._buffer = "";
    this._sectionStart = 0;
    this._index = 0;
    this._bufferOffset = 0; //chars removed from _buffer
    this._baseState = TEXT;
    this._special = SPECIAL_NONE;
    this._cbs = cbs;
    this._running = true;
    this._ended = false;
    this._xmlMode = !!(options && options.xmlMode);
    this._decodeEntities = !!(options && options.decodeEntities);
}
|
|
|
/**
 * Handler for the TEXT state.
 *
 * On "<" any pending text is emitted and the tokenizer moves to
 * BEFORE_TAG_NAME. On "&" (only when entity decoding is enabled and no
 * special tag is open) pending text is emitted and the tokenizer moves to
 * BEFORE_ENTITY, remembering TEXT as the state to return to.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateText = function(c) {
    if (c === "<") {
        if (this._index > this._sectionStart) {
            this._cbs.ontext(this._getSection());
        }
        this._state = BEFORE_TAG_NAME;
        this._sectionStart = this._index;
    } else if (
        this._decodeEntities &&
        this._special === SPECIAL_NONE &&
        c === "&"
    ) {
        if (this._index > this._sectionStart) {
            this._cbs.ontext(this._getSection());
        }
        this._baseState = TEXT;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};
|
|
|
/**
 * Handler for BEFORE_TAG_NAME (the character right after "<").
 *
 * Decides between a closing tag ("/"), a declaration ("!"), a processing
 * instruction ("?"), plain text (">", whitespace, or any character while a
 * special tag is open) and an opening tag name. A second "<" emits the
 * pending section as text and restarts the section at the new "<".
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeTagName = function(c) {
    if (c === "/") {
        this._state = BEFORE_CLOSING_TAG_NAME;
    } else if (c === "<") {
        this._cbs.ontext(this._getSection());
        this._sectionStart = this._index;
    } else if (c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) {
        this._state = TEXT;
    } else if (c === "!") {
        this._state = BEFORE_DECLARATION;
        this._sectionStart = this._index + 1;
    } else if (c === "?") {
        this._state = IN_PROCESSING_INSTRUCTION;
        this._sectionStart = this._index + 1;
    } else {
        // Outside xml mode, names starting with "s" may be <script>/<style>.
        this._state =
            !this._xmlMode && (c === "s" || c === "S")
                ? BEFORE_SPECIAL
                : IN_TAG_NAME;
        this._sectionStart = this._index;
    }
};
|
|
|
/**
 * Handler for IN_TAG_NAME. The name ends at "/", ">" or whitespace; it is
 * then emitted through `onopentagname` and the terminating character is
 * reprocessed in BEFORE_ATTRIBUTE_NAME.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInTagName = function(c) {
    if (c === "/" || c === ">" || whitespace(c)) {
        this._emitToken("onopentagname");
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
    }
};
|
|
|
/**
 * Handler for BEFORE_CLOSING_TAG_NAME (after "</").
 *
 * Whitespace is skipped and ">" returns to TEXT. While a special tag is
 * open, only a name starting with "s" is followed further (via
 * BEFORE_SPECIAL_END); any other character is reprocessed as text.
 * Otherwise the closing tag name starts at the current index.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeCloseingTagName = function(c) {
    if (whitespace(c)) {
        // ignore whitespace between "</" and the name
    } else if (c === ">") {
        this._state = TEXT;
    } else if (this._special !== SPECIAL_NONE) {
        if (c === "s" || c === "S") {
            this._state = BEFORE_SPECIAL_END;
        } else {
            this._state = TEXT;
            this._index--;
        }
    } else {
        this._state = IN_CLOSING_TAG_NAME;
        this._sectionStart = this._index;
    }
};
|
|
|
/**
 * Handler for IN_CLOSING_TAG_NAME. At ">" or whitespace the name is emitted
 * through `onclosetag` and the character is reprocessed in
 * AFTER_CLOSING_TAG_NAME.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInCloseingTagName = function(c) {
    if (c === ">" || whitespace(c)) {
        this._emitToken("onclosetag");
        this._state = AFTER_CLOSING_TAG_NAME;
        this._index--;
    }
};
|
|
|
/**
 * Handler for AFTER_CLOSING_TAG_NAME: everything up to the next ">" is
 * skipped, after which text parsing resumes behind it.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateAfterCloseingTagName = function(c) {
    //skip everything until ">"
    if (c === ">") {
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    }
};
|
|
|
/**
 * Handler for BEFORE_ATTRIBUTE_NAME. ">" ends the opening tag
 * (`onopentagend`), "/" may start a self-closing tag, and any other
 * non-whitespace character starts an attribute name.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeAttributeName = function(c) {
    if (c === ">") {
        this._cbs.onopentagend();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    } else if (c === "/") {
        this._state = IN_SELF_CLOSING_TAG;
    } else if (!whitespace(c)) {
        this._state = IN_ATTRIBUTE_NAME;
        this._sectionStart = this._index;
    }
};
|
|
|
/**
 * Handler for IN_SELF_CLOSING_TAG (after "/" inside a tag). ">" reports
 * `onselfclosingtag`; any other non-whitespace character is reprocessed in
 * BEFORE_ATTRIBUTE_NAME.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInSelfClosingTag = function(c) {
    if (c === ">") {
        this._cbs.onselfclosingtag();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    } else if (!whitespace(c)) {
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
    }
};
|
|
|
/**
 * Handler for IN_ATTRIBUTE_NAME. The name ends at "=", "/", ">" or
 * whitespace; it is reported through `onattribname` and the terminating
 * character is reprocessed in AFTER_ATTRIBUTE_NAME.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInAttributeName = function(c) {
    if (c === "=" || c === "/" || c === ">" || whitespace(c)) {
        this._cbs.onattribname(this._getSection());
        this._sectionStart = -1;
        this._state = AFTER_ATTRIBUTE_NAME;
        this._index--;
    }
};
|
|
|
/**
 * Handler for AFTER_ATTRIBUTE_NAME. "=" introduces a value; "/" or ">" end
 * a value-less attribute and are reprocessed in BEFORE_ATTRIBUTE_NAME; any
 * other non-whitespace character ends the attribute and starts the next
 * attribute name.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateAfterAttributeName = function(c) {
    if (c === "=") {
        this._state = BEFORE_ATTRIBUTE_VALUE;
    } else if (c === "/" || c === ">") {
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
    } else if (!whitespace(c)) {
        this._cbs.onattribend();
        this._state = IN_ATTRIBUTE_NAME;
        this._sectionStart = this._index;
    }
};
|
|
|
/**
 * Handler for BEFORE_ATTRIBUTE_VALUE (after "="). Selects the double-quoted,
 * single-quoted or unquoted value state; leading whitespace is skipped.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeAttributeValue = function(c) {
    if (c === '"') {
        this._state = IN_ATTRIBUTE_VALUE_DQ;
        this._sectionStart = this._index + 1;
    } else if (c === "'") {
        this._state = IN_ATTRIBUTE_VALUE_SQ;
        this._sectionStart = this._index + 1;
    } else if (!whitespace(c)) {
        this._state = IN_ATTRIBUTE_VALUE_NQ;
        this._sectionStart = this._index;
        this._index--; //reconsume token
    }
};
|
|
|
/**
 * Handler for a double-quoted attribute value. The closing quote emits the
 * pending data and ends the attribute. With entity decoding enabled, "&"
 * emits the data collected so far and enters BEFORE_ENTITY, remembering this
 * state as the one to return to.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c) {
    if (c === '"') {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
    } else if (this._decodeEntities && c === "&") {
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};
|
|
|
/**
 * Handler for a single-quoted attribute value. The closing quote emits the
 * pending data and ends the attribute. With entity decoding enabled, "&"
 * emits the data collected so far and enters BEFORE_ENTITY, remembering this
 * state as the one to return to.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c) {
    if (c === "'") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
    } else if (this._decodeEntities && c === "&") {
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};
|
|
|
/**
 * Handler for an unquoted attribute value. Whitespace or ">" ends the value;
 * that character is reprocessed in BEFORE_ATTRIBUTE_NAME. With entity
 * decoding enabled, "&" emits the data collected so far and enters
 * BEFORE_ENTITY, remembering this state as the one to return to.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c) {
    if (whitespace(c) || c === ">") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
    } else if (this._decodeEntities && c === "&") {
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};
|
|
|
/**
 * Handler for BEFORE_DECLARATION (after "<!"). "[" may start a CDATA
 * section, "-" may start a comment, anything else is a declaration.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeDeclaration = function(c) {
    this._state =
        c === "["
            ? BEFORE_CDATA_1
            : c === "-"
                ? BEFORE_COMMENT
                : IN_DECLARATION;
};
|
|
|
/**
 * Handler for IN_DECLARATION. At ">" the section is reported through
 * `ondeclaration` and text parsing resumes after it.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInDeclaration = function(c) {
    if (c === ">") {
        this._cbs.ondeclaration(this._getSection());
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    }
};
|
|
|
/**
 * Handler for IN_PROCESSING_INSTRUCTION. At ">" the section is reported
 * through `onprocessinginstruction` and text parsing resumes after it.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInProcessingInstruction = function(c) {
    if (c === ">") {
        this._cbs.onprocessinginstruction(this._getSection());
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    }
};
|
|
|
/**
 * Handler for BEFORE_COMMENT (after "<!-"). A second "-" starts the comment
 * body right after it; anything else is treated as a declaration.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeComment = function(c) {
    if (c === "-") {
        this._state = IN_COMMENT;
        this._sectionStart = this._index + 1;
    } else {
        this._state = IN_DECLARATION;
    }
};
|
|
|
/**
 * Handler for IN_COMMENT: a "-" may be the start of the closing "-->".
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInComment = function(c) {
    if (c === "-") this._state = AFTER_COMMENT_1;
};
|
|
|
/**
 * Handler for AFTER_COMMENT_1 (one "-" seen inside a comment). A second "-"
 * advances towards the end of the comment; anything else returns to the
 * comment body.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateAfterComment1 = function(c) {
    if (c === "-") {
        this._state = AFTER_COMMENT_2;
    } else {
        this._state = IN_COMMENT;
    }
};
|
|
|
/**
 * Handler for AFTER_COMMENT_2 ("--" seen inside a comment). ">" ends the
 * comment and reports its content without the two trailing dashes; another
 * "-" keeps this state; anything else returns to the comment body.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateAfterComment2 = function(c) {
    if (c === ">") {
        //remove 2 trailing chars
        this._cbs.oncomment(
            this._buffer.substring(this._sectionStart, this._index - 2)
        );
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    } else if (c !== "-") {
        this._state = IN_COMMENT;
    }
    // else: stay in AFTER_COMMENT_2 (`--->`)
};
|
|
|
// Handlers matching the letters of "CDATA" after "<![", one character per
// state. A mismatch falls back to IN_DECLARATION and reprocesses the
// character there (see ifElseState).
Tokenizer.prototype._stateBeforeCdata1 = ifElseState(
    "C",
    BEFORE_CDATA_2,
    IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata2 = ifElseState(
    "D",
    BEFORE_CDATA_3,
    IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata3 = ifElseState(
    "A",
    BEFORE_CDATA_4,
    IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata4 = ifElseState(
    "T",
    BEFORE_CDATA_5,
    IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata5 = ifElseState(
    "A",
    BEFORE_CDATA_6,
    IN_DECLARATION
);
|
|
|
/**
 * Handler for BEFORE_CDATA_6 (after "<![CDATA"). "[" opens the CDATA body,
 * which starts at the next character; anything else is reprocessed as part
 * of a declaration.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeCdata6 = function(c) {
    if (c === "[") {
        this._state = IN_CDATA;
        this._sectionStart = this._index + 1;
    } else {
        this._state = IN_DECLARATION;
        this._index--;
    }
};
|
|
|
/**
 * Handler for IN_CDATA: a "]" may be the start of the closing "]]>".
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInCdata = function(c) {
    if (c === "]") this._state = AFTER_CDATA_1;
};
|
|
|
/**
 * Handler for AFTER_CDATA_1 (one "]" seen inside CDATA). A second "]"
 * advances towards the end of the section; anything else returns to the
 * CDATA body.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateAfterCdata1 = function(c) {
    if (c === "]") this._state = AFTER_CDATA_2;
    else this._state = IN_CDATA;
};
|
|
|
/**
 * Handler for AFTER_CDATA_2 ("]]" seen inside CDATA). ">" ends the section
 * and reports its content without the two trailing brackets; another "]"
 * keeps this state; anything else returns to the CDATA body.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateAfterCdata2 = function(c) {
    if (c === ">") {
        //remove 2 trailing chars
        this._cbs.oncdata(
            this._buffer.substring(this._sectionStart, this._index - 2)
        );
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    } else if (c !== "]") {
        this._state = IN_CDATA;
    }
    //else: stay in AFTER_CDATA_2 (`]]]>`)
};
|
|
|
/**
 * Handler for BEFORE_SPECIAL (an opening tag name starting with "s").
 * "c" continues towards "script", "t" towards "style"; anything else is
 * reprocessed as an ordinary tag name character.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeSpecial = function(c) {
    if (c === "c" || c === "C") {
        this._state = BEFORE_SCRIPT_1;
    } else if (c === "t" || c === "T") {
        this._state = BEFORE_STYLE_1;
    } else {
        this._state = IN_TAG_NAME;
        this._index--; //consume the token again
    }
};
|
|
|
/**
 * Handler for BEFORE_SPECIAL_END (after "</s" while a special tag is open).
 * Continues towards "</script" or "</style" depending on which special tag
 * is open; otherwise the input is plain text.
 *
 * On the fallback path the character is reprocessed in TEXT (as every other
 * failure path in the closing-tag matcher does), so a "<" at this position
 * can still start the real closing tag.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeSpecialEnd = function(c) {
    if (this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")) {
        this._state = AFTER_SCRIPT_1;
    } else if (this._special === SPECIAL_STYLE && (c === "t" || c === "T")) {
        this._state = AFTER_STYLE_1;
    } else {
        this._state = TEXT;
        this._index--; //reconsume the token
    }
};
|
|
|
// Handlers matching the remaining letters of an opening "script" tag name
// after "sc", one character per state. A mismatch falls back to IN_TAG_NAME
// (see consumeSpecialNameChar).
Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar(
    "R",
    BEFORE_SCRIPT_2
);
Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar(
    "I",
    BEFORE_SCRIPT_3
);
Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar(
    "P",
    BEFORE_SCRIPT_4
);
Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar(
    "T",
    BEFORE_SCRIPT_5
);
|
|
|
/**
 * Handler for BEFORE_SCRIPT_5 (after "script" in an opening tag name). If
 * the name ends here ("/", ">" or whitespace) the tokenizer records that a
 * script tag is open. In every case the character is reprocessed in
 * IN_TAG_NAME.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeScript5 = function(c) {
    if (c === "/" || c === ">" || whitespace(c)) {
        this._special = SPECIAL_SCRIPT;
    }
    this._state = IN_TAG_NAME;
    this._index--; //consume the token again
};
|
|
|
// Handlers matching the remaining letters of a closing "script" tag name
// after "</sc". A mismatch falls back to TEXT and reprocesses the character.
Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT);
Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT);
Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT);
Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT);
|
|
|
/**
 * Handler for AFTER_SCRIPT_5 (after "</script"). If the name ends here (">"
 * or whitespace) the special mode is left and the closing tag is handed to
 * IN_CLOSING_TAG_NAME, with the section start moved back 6 characters so it
 * covers the name just matched.
 *
 * Otherwise the input is plain text; the character is reprocessed in TEXT
 * (consistent with AFTER_SCRIPT_1..4) so a "<" here is not swallowed.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateAfterScript5 = function(c) {
    if (c === ">" || whitespace(c)) {
        this._special = SPECIAL_NONE;
        this._state = IN_CLOSING_TAG_NAME;
        this._sectionStart = this._index - 6;
        this._index--; //reconsume the token
    } else {
        this._state = TEXT;
        this._index--; //reconsume the token
    }
};
|
|
|
// Handlers matching the remaining letters of an opening "style" tag name
// after "st", one character per state. A mismatch falls back to IN_TAG_NAME
// (see consumeSpecialNameChar).
Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar(
    "Y",
    BEFORE_STYLE_2
);
Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar(
    "L",
    BEFORE_STYLE_3
);
Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar(
    "E",
    BEFORE_STYLE_4
);
|
|
|
/**
 * Handler for BEFORE_STYLE_4 (after "style" in an opening tag name). If the
 * name ends here ("/", ">" or whitespace) the tokenizer records that a style
 * tag is open. In every case the character is reprocessed in IN_TAG_NAME.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateBeforeStyle4 = function(c) {
    if (c === "/" || c === ">" || whitespace(c)) {
        this._special = SPECIAL_STYLE;
    }
    this._state = IN_TAG_NAME;
    this._index--; //consume the token again
};
|
|
|
// Handlers matching the remaining letters of a closing "style" tag name
// after "</st". A mismatch falls back to TEXT and reprocesses the character.
Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT);
Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT);
Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT);
|
|
|
/**
 * Handler for AFTER_STYLE_4 (after "</style"). If the name ends here (">" or
 * whitespace) the special mode is left and the closing tag is handed to
 * IN_CLOSING_TAG_NAME, with the section start moved back 5 characters so it
 * covers the name just matched.
 *
 * Otherwise the input is plain text; the character is reprocessed in TEXT
 * (consistent with AFTER_STYLE_1..3) so a "<" here is not swallowed.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateAfterStyle4 = function(c) {
    if (c === ">" || whitespace(c)) {
        this._special = SPECIAL_NONE;
        this._state = IN_CLOSING_TAG_NAME;
        this._sectionStart = this._index - 5;
        this._index--; //reconsume the token
    } else {
        this._state = TEXT;
        this._index--; //reconsume the token
    }
};
|
|
|
// After "&": "#" introduces a numeric entity, anything else is reprocessed
// as the first character of a named entity.
Tokenizer.prototype._stateBeforeEntity = ifElseState(
    "#",
    BEFORE_NUMERIC_ENTITY,
    IN_NAMED_ENTITY
);
// After "&#": "x"/"X" introduces a hexadecimal entity, anything else is
// reprocessed as the first character of a decimal entity.
Tokenizer.prototype._stateBeforeNumericEntity = ifElseState(
    "X",
    IN_HEX_ENTITY,
    IN_NUMERIC_ENTITY
);
|
|
|
/**
 * Decodes a named entity that is terminated with a semicolon.
 *
 * The candidate name is the text between the "&" at `_sectionStart` and the
 * current index. It is looked up in the XML map (xml mode) or the full
 * entity map; on a hit the decoded value is emitted and the section restarts
 * after the current index. Nothing happens for an empty or unknown name.
 */
Tokenizer.prototype._parseNamedEntityStrict = function() {
    //offset = 1
    if (this._sectionStart + 1 < this._index) {
        var entity = this._buffer.substring(
                this._sectionStart + 1,
                this._index
            ),
            map = this._xmlMode ? xmlMap : entityMap;

        // Call through Object.prototype so the lookup does not depend on the
        // map object itself providing hasOwnProperty.
        if (Object.prototype.hasOwnProperty.call(map, entity)) {
            this._emitPartial(map[entity]);
            this._sectionStart = this._index + 1;
        }
    }
};
|
|
|
/**
 * Decodes a legacy entity (one that is valid without a trailing semicolon).
 *
 * Tries the longest candidate first (at most 6 characters after the "&")
 * and shortens it down to 2 characters until a name from the legacy map
 * matches. On a hit the decoded value is emitted and `_sectionStart` is
 * moved past the "&" and the matched name.
 */
Tokenizer.prototype._parseLegacyEntity = function() {
    var start = this._sectionStart + 1,
        limit = this._index - start;

    if (limit > 6) limit = 6; //the max length of legacy entities is 6

    while (limit >= 2) {
        //the min length of legacy entities is 2
        var entity = this._buffer.substring(start, start + limit);

        if (Object.prototype.hasOwnProperty.call(legacyMap, entity)) {
            this._emitPartial(legacyMap[entity]);
            this._sectionStart += limit + 1;
            return;
        } else {
            limit--;
        }
    }
};
|
|
|
/**
 * Handler for IN_NAMED_ENTITY.
 *
 * ";" ends the entity: a strict lookup is tried first and, outside xml mode,
 * a legacy lookup if text is still pending. Any character that is not an
 * ASCII letter or digit also ends the entity and is reprocessed in the base
 * state; outside xml mode a non-empty name is then decoded with the strict
 * lookup inside attribute values (unless the character is "=") or with the
 * legacy lookup in text.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInNamedEntity = function(c) {
    if (c === ";") {
        this._parseNamedEntityStrict();
        if (this._sectionStart + 1 < this._index && !this._xmlMode) {
            this._parseLegacyEntity();
        }
        this._state = this._baseState;
    } else if (
        (c < "a" || c > "z") &&
        (c < "A" || c > "Z") &&
        (c < "0" || c > "9")
    ) {
        // Nothing is decoded in xml mode or when the name is empty.
        if (!this._xmlMode && this._sectionStart + 1 !== this._index) {
            if (this._baseState !== TEXT) {
                if (c !== "=") {
                    this._parseNamedEntityStrict();
                }
            } else {
                this._parseLegacyEntity();
            }
        }

        this._state = this._baseState;
        this._index--;
    }
};
|
|
|
/**
 * Decodes the digits of a numeric entity and returns to the base state.
 *
 * The digits are the text between `_sectionStart + offset` and the current
 * index. If there is at least one, the decoded code point is emitted and the
 * section restarts at the current index. If there is none, `_sectionStart`
 * is decremented; the ";" callers increment it again right after this call,
 * which leaves the section start unchanged in that case.
 *
 * @param {number} offset Length of the entity prefix ("&#" or "&#x").
 * @param {number} base Radix passed to parseInt (10 or 16 at the call sites).
 */
Tokenizer.prototype._decodeNumericEntity = function(offset, base) {
    var sectionStart = this._sectionStart + offset;

    if (sectionStart !== this._index) {
        //parse entity
        var entity = this._buffer.substring(sectionStart, this._index);
        var parsed = parseInt(entity, base);

        this._emitPartial(decodeCodePoint(parsed));
        this._sectionStart = this._index;
    } else {
        this._sectionStart--;
    }

    this._state = this._baseState;
};
|
|
|
/**
 * Handler for IN_NUMERIC_ENTITY (decimal digits after "&#").
 *
 * ";" ends the entity; the section start is then moved past the semicolon.
 * Any other non-digit also ends it and is reprocessed in the base state.
 * Without a semicolon the entity is only decoded outside xml mode and only
 * if at least one digit was read; otherwise the raw "&#" stays part of the
 * pending section.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInNumericEntity = function(c) {
    if (c === ";") {
        this._decodeNumericEntity(2, 10);
        this._sectionStart++;
    } else if (c < "0" || c > "9") {
        // _decodeNumericEntity moves _sectionStart back by one when there
        // are no digits, which is only compensated on the ";" path. Skip
        // the call here so "&#" followed by a non-digit neither duplicates
        // the preceding character nor drops the pending text.
        if (!this._xmlMode && this._sectionStart + 2 !== this._index) {
            this._decodeNumericEntity(2, 10);
        } else {
            this._state = this._baseState;
        }
        this._index--;
    }
};
|
|
|
/**
 * Handler for IN_HEX_ENTITY (hexadecimal digits after "&#x").
 *
 * ";" ends the entity; the section start is then moved past the semicolon.
 * Any other non-hex character also ends it and is reprocessed in the base
 * state. Without a semicolon the entity is only decoded outside xml mode and
 * only if at least one digit was read; otherwise the raw "&#x" stays part of
 * the pending section.
 *
 * @param {string} c Character at the current index.
 */
Tokenizer.prototype._stateInHexEntity = function(c) {
    if (c === ";") {
        this._decodeNumericEntity(3, 16);
        this._sectionStart++;
    } else if (
        (c < "a" || c > "f") &&
        (c < "A" || c > "F") &&
        (c < "0" || c > "9")
    ) {
        // _decodeNumericEntity moves _sectionStart back by one when there
        // are no digits, which is only compensated on the ";" path. Skip
        // the call here so "&#x" followed by a non-hex character neither
        // duplicates the preceding character nor drops the pending text.
        if (!this._xmlMode && this._sectionStart + 3 !== this._index) {
            this._decodeNumericEntity(3, 16);
        } else {
            this._state = this._baseState;
        }
        this._index--;
    }
};
|
|
|
/**
 * Drops the part of the buffer that is no longer needed after a parse pass
 * and keeps `_bufferOffset` in sync with the number of removed characters.
 *
 * - No open section (`_sectionStart < 0`): the whole buffer is dropped.
 * - Running in TEXT: pending text is emitted, then the buffer is dropped.
 * - Running and the section just started: the buffer is dropped.
 * - Running otherwise: everything before the section start is removed.
 * When paused with an open section, nothing is changed.
 */
Tokenizer.prototype._cleanup = function() {
    if (this._sectionStart < 0) {
        this._buffer = "";
        this._bufferOffset += this._index;
        this._index = 0;
    } else if (this._running) {
        if (this._state === TEXT) {
            if (this._sectionStart !== this._index) {
                this._cbs.ontext(this._buffer.slice(this._sectionStart));
            }
            this._buffer = "";
            this._bufferOffset += this._index;
            this._index = 0;
        } else if (this._sectionStart === this._index) {
            //the section just started
            this._buffer = "";
            this._bufferOffset += this._index;
            this._index = 0;
        } else {
            //remove everything unnecessary
            this._buffer = this._buffer.slice(this._sectionStart);
            this._index -= this._sectionStart;
            this._bufferOffset += this._sectionStart;
        }

        this._sectionStart = 0;
    }
};
|
|
|
//TODO make events conditional
/**
 * Appends `chunk` to the buffer and parses as far as possible.
 *
 * Writing after `end()` is reported through `onerror` and the chunk is
 * ignored, so no further tokens are emitted once the end was signalled.
 *
 * @param {string} chunk Input to append.
 */
Tokenizer.prototype.write = function(chunk) {
    if (this._ended) {
        this._cbs.onerror(new Error(".write() after done!"));
        return;
    }

    this._buffer += chunk;
    this._parse();
};
|
|
|
/**
 * Main loop: feeds one character at a time to the handler of the current
 * state until the buffer is exhausted or the tokenizer is paused, then
 * trims the buffer via `_cleanup`.
 *
 * Handlers that need a character processed again decrement `_index`; the
 * unconditional increment at the end of each iteration then lands on the
 * same character. An unrecognised state is reported through `onerror`.
 */
Tokenizer.prototype._parse = function() {
    while (this._index < this._buffer.length && this._running) {
        var c = this._buffer.charAt(this._index);
        if (this._state === TEXT) {
            this._stateText(c);
        } else if (this._state === BEFORE_TAG_NAME) {
            this._stateBeforeTagName(c);
        } else if (this._state === IN_TAG_NAME) {
            this._stateInTagName(c);
        } else if (this._state === BEFORE_CLOSING_TAG_NAME) {
            this._stateBeforeCloseingTagName(c);
        } else if (this._state === IN_CLOSING_TAG_NAME) {
            this._stateInCloseingTagName(c);
        } else if (this._state === AFTER_CLOSING_TAG_NAME) {
            this._stateAfterCloseingTagName(c);
        } else if (this._state === IN_SELF_CLOSING_TAG) {
            this._stateInSelfClosingTag(c);
        } else if (this._state === BEFORE_ATTRIBUTE_NAME) {
            /*
             * attributes
             */
            this._stateBeforeAttributeName(c);
        } else if (this._state === IN_ATTRIBUTE_NAME) {
            this._stateInAttributeName(c);
        } else if (this._state === AFTER_ATTRIBUTE_NAME) {
            this._stateAfterAttributeName(c);
        } else if (this._state === BEFORE_ATTRIBUTE_VALUE) {
            this._stateBeforeAttributeValue(c);
        } else if (this._state === IN_ATTRIBUTE_VALUE_DQ) {
            this._stateInAttributeValueDoubleQuotes(c);
        } else if (this._state === IN_ATTRIBUTE_VALUE_SQ) {
            this._stateInAttributeValueSingleQuotes(c);
        } else if (this._state === IN_ATTRIBUTE_VALUE_NQ) {
            this._stateInAttributeValueNoQuotes(c);
        } else if (this._state === BEFORE_DECLARATION) {
            /*
             * declarations
             */
            this._stateBeforeDeclaration(c);
        } else if (this._state === IN_DECLARATION) {
            this._stateInDeclaration(c);
        } else if (this._state === IN_PROCESSING_INSTRUCTION) {
            /*
             * processing instructions
             */
            this._stateInProcessingInstruction(c);
        } else if (this._state === BEFORE_COMMENT) {
            /*
             * comments
             */
            this._stateBeforeComment(c);
        } else if (this._state === IN_COMMENT) {
            this._stateInComment(c);
        } else if (this._state === AFTER_COMMENT_1) {
            this._stateAfterComment1(c);
        } else if (this._state === AFTER_COMMENT_2) {
            this._stateAfterComment2(c);
        } else if (this._state === BEFORE_CDATA_1) {
            /*
             * cdata
             */
            this._stateBeforeCdata1(c);
        } else if (this._state === BEFORE_CDATA_2) {
            this._stateBeforeCdata2(c);
        } else if (this._state === BEFORE_CDATA_3) {
            this._stateBeforeCdata3(c);
        } else if (this._state === BEFORE_CDATA_4) {
            this._stateBeforeCdata4(c);
        } else if (this._state === BEFORE_CDATA_5) {
            this._stateBeforeCdata5(c);
        } else if (this._state === BEFORE_CDATA_6) {
            this._stateBeforeCdata6(c);
        } else if (this._state === IN_CDATA) {
            this._stateInCdata(c);
        } else if (this._state === AFTER_CDATA_1) {
            this._stateAfterCdata1(c);
        } else if (this._state === AFTER_CDATA_2) {
            this._stateAfterCdata2(c);
        } else if (this._state === BEFORE_SPECIAL) {
            /*
             * special tags
             */
            this._stateBeforeSpecial(c);
        } else if (this._state === BEFORE_SPECIAL_END) {
            this._stateBeforeSpecialEnd(c);
        } else if (this._state === BEFORE_SCRIPT_1) {
            /*
             * script
             */
            this._stateBeforeScript1(c);
        } else if (this._state === BEFORE_SCRIPT_2) {
            this._stateBeforeScript2(c);
        } else if (this._state === BEFORE_SCRIPT_3) {
            this._stateBeforeScript3(c);
        } else if (this._state === BEFORE_SCRIPT_4) {
            this._stateBeforeScript4(c);
        } else if (this._state === BEFORE_SCRIPT_5) {
            this._stateBeforeScript5(c);
        } else if (this._state === AFTER_SCRIPT_1) {
            this._stateAfterScript1(c);
        } else if (this._state === AFTER_SCRIPT_2) {
            this._stateAfterScript2(c);
        } else if (this._state === AFTER_SCRIPT_3) {
            this._stateAfterScript3(c);
        } else if (this._state === AFTER_SCRIPT_4) {
            this._stateAfterScript4(c);
        } else if (this._state === AFTER_SCRIPT_5) {
            this._stateAfterScript5(c);
        } else if (this._state === BEFORE_STYLE_1) {
            /*
             * style
             */
            this._stateBeforeStyle1(c);
        } else if (this._state === BEFORE_STYLE_2) {
            this._stateBeforeStyle2(c);
        } else if (this._state === BEFORE_STYLE_3) {
            this._stateBeforeStyle3(c);
        } else if (this._state === BEFORE_STYLE_4) {
            this._stateBeforeStyle4(c);
        } else if (this._state === AFTER_STYLE_1) {
            this._stateAfterStyle1(c);
        } else if (this._state === AFTER_STYLE_2) {
            this._stateAfterStyle2(c);
        } else if (this._state === AFTER_STYLE_3) {
            this._stateAfterStyle3(c);
        } else if (this._state === AFTER_STYLE_4) {
            this._stateAfterStyle4(c);
        } else if (this._state === BEFORE_ENTITY) {
            /*
             * entities
             */
            this._stateBeforeEntity(c);
        } else if (this._state === BEFORE_NUMERIC_ENTITY) {
            this._stateBeforeNumericEntity(c);
        } else if (this._state === IN_NAMED_ENTITY) {
            this._stateInNamedEntity(c);
        } else if (this._state === IN_NUMERIC_ENTITY) {
            this._stateInNumericEntity(c);
        } else if (this._state === IN_HEX_ENTITY) {
            this._stateInHexEntity(c);
        } else {
            this._cbs.onerror(new Error("unknown _state"), this._state);
        }

        this._index++;
    }

    this._cleanup();
};
|
|
|
/**
 * Pauses tokenizing: the parse loop stops before the next character.
 */
Tokenizer.prototype.pause = function() {
    this._running = false;
};
|
/**
 * Resumes tokenizing after `pause()`: parses whatever is still buffered and,
 * if `end()` was already called, finishes the stream.
 */
Tokenizer.prototype.resume = function() {
    this._running = true;

    if (this._index < this._buffer.length) {
        this._parse();
    }
    if (this._ended) {
        this._finish();
    }
};
|
|
|
/**
 * Signals the end of the input, optionally writing a final chunk first.
 *
 * If the tokenizer is running, trailing data is flushed and `onend` fires
 * immediately; if it is paused, that happens on `resume()`. Calling `end()`
 * again is reported through `onerror` and otherwise ignored, so `onend` is
 * not fired a second time by this method.
 *
 * @param {string} [chunk] Final input to process before finishing.
 */
Tokenizer.prototype.end = function(chunk) {
    if (this._ended) {
        this._cbs.onerror(new Error(".end() after done!"));
        return;
    }
    if (chunk) this.write(chunk);

    this._ended = true;

    if (this._running) this._finish();
};
|
|
|
/**
 * Flushes any data still pending in the current section, then fires `onend`.
 */
Tokenizer.prototype._finish = function() {
    //if there is remaining data, emit it in a reasonable way
    if (this._sectionStart < this._index) {
        this._handleTrailingData();
    }

    this._cbs.onend();
};
|
|
|
/**
 * Emits the data left in the buffer when the input ends mid-token.
 *
 * - Inside CDATA or a comment: the rest is reported as CDATA / comment.
 * - Inside an entity (outside xml mode): the entity is decoded if possible;
 *   whatever remains is handled again in the base state.
 * - Inside a tag name, attribute or closing tag name: the rest is ignored.
 * - Everything else is reported as text.
 */
Tokenizer.prototype._handleTrailingData = function() {
    var data = this._buffer.slice(this._sectionStart);

    if (
        this._state === IN_CDATA ||
        this._state === AFTER_CDATA_1 ||
        this._state === AFTER_CDATA_2
    ) {
        this._cbs.oncdata(data);
    } else if (
        this._state === IN_COMMENT ||
        this._state === AFTER_COMMENT_1 ||
        this._state === AFTER_COMMENT_2
    ) {
        this._cbs.oncomment(data);
    } else if (this._state === IN_NAMED_ENTITY && !this._xmlMode) {
        this._parseLegacyEntity();
        if (this._sectionStart < this._index) {
            this._state = this._baseState;
            this._handleTrailingData();
        }
    } else if (this._state === IN_NUMERIC_ENTITY && !this._xmlMode) {
        // Only decode when digits follow "&#": with none,
        // _decodeNumericEntity would move _sectionStart back by one and the
        // wrong slice of the buffer would be emitted below.
        if (this._sectionStart + 2 < this._index) {
            this._decodeNumericEntity(2, 10);
        }
        if (this._sectionStart < this._index) {
            this._state = this._baseState;
            this._handleTrailingData();
        }
    } else if (this._state === IN_HEX_ENTITY && !this._xmlMode) {
        // Same guard for "&#x" without any hex digit.
        if (this._sectionStart + 3 < this._index) {
            this._decodeNumericEntity(3, 16);
        }
        if (this._sectionStart < this._index) {
            this._state = this._baseState;
            this._handleTrailingData();
        }
    } else if (
        this._state !== IN_TAG_NAME &&
        this._state !== BEFORE_ATTRIBUTE_NAME &&
        this._state !== BEFORE_ATTRIBUTE_VALUE &&
        this._state !== AFTER_ATTRIBUTE_NAME &&
        this._state !== IN_ATTRIBUTE_NAME &&
        this._state !== IN_ATTRIBUTE_VALUE_SQ &&
        this._state !== IN_ATTRIBUTE_VALUE_DQ &&
        this._state !== IN_ATTRIBUTE_VALUE_NQ &&
        this._state !== IN_CLOSING_TAG_NAME
    ) {
        this._cbs.ontext(data);
    }
    //else, ignore remaining data
    //TODO add a way to remove current tag
};
|
|
|
/**
 * Returns the tokenizer to its initial state by re-running the constructor
 * with the current `xmlMode` / `decodeEntities` settings and callbacks.
 */
Tokenizer.prototype.reset = function() {
    Tokenizer.call(
        this,
        { xmlMode: this._xmlMode, decodeEntities: this._decodeEntities },
        this._cbs
    );
};
|
|
|
/**
 * Returns the current position counted over everything written so far:
 * the number of characters already removed from the buffer plus the index
 * inside the buffer.
 *
 * @returns {number} Absolute index.
 */
Tokenizer.prototype.getAbsoluteIndex = function() {
    return this._bufferOffset + this._index;
};
|
|
|
/**
 * Returns the buffered text of the current section, i.e. from
 * `_sectionStart` up to (not including) the current index.
 *
 * @returns {string} Section text.
 */
Tokenizer.prototype._getSection = function() {
    return this._buffer.substring(this._sectionStart, this._index);
};
|
|
|
/**
 * Passes the current section to the callback called `name` and marks the
 * section as closed (`_sectionStart = -1`).
 *
 * @param {string} name Name of the callback on `_cbs` to invoke.
 */
Tokenizer.prototype._emitToken = function(name) {
    this._cbs[name](this._getSection());
    this._sectionStart = -1;
};
|
|
|
/**
 * Emits a decoded entity value: as attribute data when the entity was found
 * inside an attribute value (base state is not TEXT), as text otherwise.
 *
 * @param {string} value Decoded value to emit.
 */
Tokenizer.prototype._emitPartial = function(value) {
    if (this._baseState !== TEXT) {
        this._cbs.onattribdata(value); //TODO implement the new event
    } else {
        this._cbs.ontext(value);
    }
};
|
|
|