You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
399 lines
10 KiB
399 lines
10 KiB
export default createTokenizer |
|
|
|
import assert from 'assert' |
|
import createDebug from 'debug' |
|
import assign from '../constant/assign.mjs' |
|
import codes from '../character/codes.mjs' |
|
import markdownLineEnding from '../character/markdown-line-ending.mjs' |
|
import chunkedPush from './chunked-push.mjs' |
|
import chunkedSplice from './chunked-splice.mjs' |
|
import miniflat from './miniflat.mjs' |
|
import resolveAll from './resolve-all.mjs' |
|
import serializeChunks from './serialize-chunks.mjs' |
|
import shallow from './shallow.mjs' |
|
import sliceChunks from './slice-chunks.mjs' |
|
|
|
// Namespaced debug logger used by the tokenizer for tracing.
const debug = createDebug('micromark')
|
|
|
// Create a tokenizer. |
|
// Tokenizers deal with one type of data (e.g., containers, flow, text). |
|
// The parser is the object dealing with it all. |
|
// `initialize` works like other constructs, except that only its `tokenize` |
|
// function is used, in which case it doesn’t receive an `ok` or `nok`. |
|
// `from` can be given to set the point before the first character, although |
|
// when further lines are indented, they must be set with `defineSkip`. |
|
function createTokenizer(parser, initialize, from) { |
|
var point = from ? shallow(from) : {line: 1, column: 1, offset: 0} |
|
var columnStart = {} |
|
var resolveAllConstructs = [] |
|
var chunks = [] |
|
var stack = [] |
|
var consumed = true |
|
|
|
// Tools used for tokenizing. |
|
var effects = { |
|
consume: consume, |
|
enter: enter, |
|
exit: exit, |
|
attempt: constructFactory(onsuccessfulconstruct), |
|
check: constructFactory(onsuccessfulcheck), |
|
interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}), |
|
lazy: constructFactory(onsuccessfulcheck, {lazy: true}) |
|
} |
|
|
|
// State and tools for resolving and serializing. |
|
var context = { |
|
previous: codes.eof, |
|
events: [], |
|
parser: parser, |
|
sliceStream: sliceStream, |
|
sliceSerialize: sliceSerialize, |
|
now: now, |
|
defineSkip: skip, |
|
write: write |
|
} |
|
|
|
// The state function. |
|
var state = initialize.tokenize.call(context, effects) |
|
|
|
// Track which character we expect to be consumed, to catch bugs. |
|
var expectedCode |
|
|
|
if (initialize.resolveAll) { |
|
resolveAllConstructs.push(initialize) |
|
} |
|
|
|
// Store where we are in the input stream. |
|
point._index = 0 |
|
point._bufferIndex = -1 |
|
|
|
return context |
|
|
|
// Feed more chunks to the tokenizer.
// `slice` is appended to the buffered chunks (through `chunkedPush`) and the
// main loop is run over whatever is now available.
// Returns an empty list as long as the last buffered chunk is not the EOF
// code; once it is, the result of the initial construct is added, all
// collected `resolveAll` handlers are run, and the final events are stored on
// `context.events` and returned.
function write(slice) {
  chunks = chunkedPush(chunks, slice)

  main()

  const lastChunk = chunks[chunks.length - 1]

  if (lastChunk === codes.eof) {
    addResult(initialize, 0)

    // Done: resolve everything (resolvers might change events), and exit.
    context.events = resolveAll(resolveAllConstructs, context.events, context)

    return context.events
  }

  // Not done yet: there is nothing to hand out.
  return []
}
|
|
|
// |
|
// Tools. |
|
// |
|
|
|
// Serialize the chunks spanned by `token`: the chunks are first sliced with
// `sliceStream` and the result is passed to `serializeChunks`.
function sliceSerialize(token) {
  const stream = sliceStream(token)

  return serializeChunks(stream)
}
|
|
|
// Get the part of the buffered chunks that `token` spans, as computed by
// `sliceChunks`.
function sliceStream(token) {
  const spanned = sliceChunks(chunks, token)

  return spanned
}
|
|
|
// Get the current position.
// The tokenizer's own `point` is passed through `shallow` (presumably a
// shallow copy, so later moves of the tokenizer do not affect the result).
function now() {
  const current = shallow(point)

  return current
}
|
|
|
// Define where content starts on a line: records `value.column` as the start
// column for `value.line`, then applies it right away in case the tokenizer
// currently sits at the start of that line.
function skip(value) {
  const {line, column} = value

  columnStart[line] = column
  accountForPotentialSkip()
  debug('position: define skip: `%j`', point)
}
|
|
|
// |
|
// State management. |
|
// |
|
|
|
// Main loop.
// Walks through the buffered chunks, which are either strings of several
// characters or numerical character codes, and feeds every code to `go`.
// `point._index` and `point._bufferIndex` are not advanced here: that is done
// by `consume`, so each round reads them again.
// This is a loop rather than a chain of calls so that the stack can drain.
function main() {
  while (point._index < chunks.length) {
    const chunk = chunks[point._index]

    // A single character code: pass it on as is.
    if (typeof chunk !== 'string') {
      go(chunk)
      continue
    }

    // A buffer chunk: loop through its characters for as long as we are
    // still in this chunk.
    const chunkIndex = point._index

    if (point._bufferIndex < 0) {
      point._bufferIndex = 0
    }

    while (point._index === chunkIndex && point._bufferIndex < chunk.length) {
      go(chunk.charCodeAt(point._bufferIndex))
    }
  }
}
|
|
|
// Deal with one code: hand it to the current state function and keep whatever
// that returns as the next state.
// Asserts that the previous code was consumed, and remembers `code` as the
// one that `consume`, `ok`, and `nok` are expected to receive.
function go(code) {
  assert.equal(consumed, true, 'expected character to be consumed')

  const currentState = state

  consumed = undefined
  debug('main: passing `%s` to %s', code, currentState.name)
  expectedCode = code
  state = currentState(code)
}
|
|
|
// Move a character forward.
// `code` must be the code that was just passed to the state machine
// (`expectedCode`) and must not have been consumed before.
// Updates `point` (line, column, offset, and the internal chunk indices),
// exposes `code` as `context.previous`, and marks the code as consumed.
function consume(code) {
  assert.equal(
    code,
    expectedCode,
    'expected given code to equal expected code'
  )

  debug('consume: `%s`', code)

  assert.equal(consumed, undefined, 'expected code to not have been consumed')

  // Look up the last event defensively: without any events there is no open
  // token, which must be reported through the assertion below instead of
  // crashing on an out-of-range index.
  const events = context.events
  const lastEvent = events.length > 0 ? events[events.length - 1] : undefined

  assert(
    code === null
      ? lastEvent === undefined || lastEvent[0] === 'exit'
      : lastEvent !== undefined && lastEvent[0] === 'enter',
    'expected last token to be open'
  )

  if (markdownLineEnding(code)) {
    point.line++
    point.column = 1
    // The CR+LF code counts for 2 in the offset, other line endings for 1.
    point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
    accountForPotentialSkip()
    debug('position: after eol: `%j`', point)
  } else if (code !== codes.virtualSpace) {
    // Virtual spaces move neither column nor offset.
    point.column++
    point.offset++
  }

  // Not in a string chunk.
  if (point._bufferIndex < 0) {
    point._index++
  } else {
    point._bufferIndex++

    // At end of string chunk.
    if (point._bufferIndex === chunks[point._index].length) {
      point._bufferIndex = -1
      point._index++
    }
  }

  // Expose the previous character.
  context.previous = code

  // Mark as consumed.
  consumed = true
}
|
|
|
// Start a token.
// `fields`, when given, is used as the token itself (it is not copied);
// otherwise a new object is created.
// The token gets its `type` and `start` set, an `enter` event is added, and
// the token is put on the stack of open tokens.
// Returns the token.
function enter(type, fields) {
  const token = fields || {}

  token.type = type
  token.start = now()

  assert.equal(typeof type, 'string', 'expected string type')
  assert.notEqual(type.length, 0, 'expected non-empty string')
  debug('enter: `%s`', type)

  const event = ['enter', token, context]

  context.events.push(event)
  stack.push(token)

  return token
}
|
|
|
// Stop a token.
// Takes the most recently opened token off the stack, sets its `end`, and
// adds an `exit` event for it.
// Asserts that `type` is a non-empty string matching the open token, that
// there is an open token at all, and that the token is not empty.
// Returns the token.
function exit(type) {
  assert.equal(typeof type, 'string', 'expected string type')
  assert.notEqual(type.length, 0, 'expected non-empty string')
  assert.notEqual(stack.length, 0, 'cannot close w/o open tokens')

  const token = stack.pop()

  token.end = now()

  assert.equal(type, token.type, 'expected exit token to match current token')

  // A token is empty if it ends at exactly the place where it started.
  const isEmpty =
    token.start._index === token.end._index &&
    token.start._bufferIndex === token.end._bufferIndex

  assert(!isEmpty, 'expected non-empty token (`' + type + '`)')

  debug('exit: `%s`', token.type)

  const event = ['exit', token, context]

  context.events.push(event)

  return token
}
|
|
|
// Use results: called when an attempted construct succeeded.
// Adds the result of `construct`, starting at the event index that was
// stored (`info.from`) when the attempt began.
function onsuccessfulconstruct(construct, info) {
  const startIndex = info.from

  addResult(construct, startIndex)
}
|
|
|
// Discard results: called when a checked construct succeeded.
// Nothing of the check is kept: the stored state in `info` is restored.
// `construct` is accepted for parity with `onsuccessfulconstruct` but unused.
function onsuccessfulcheck(construct, info) {
  info.restore()
}
|
|
|
// Factory to attempt/check/interrupt. |
|
function constructFactory(onreturn, fields) { |
|
return hook |
|
|
|
// Handle either an object mapping codes to constructs, a list of |
|
// constructs, or a single construct. |
|
function hook(constructs, returnState, bogusState) { |
|
var listOfConstructs |
|
var constructIndex |
|
var currentConstruct |
|
var info |
|
|
|
return constructs.tokenize || 'length' in constructs |
|
? handleListOfConstructs(miniflat(constructs)) |
|
: handleMapOfConstructs |
|
|
|
function handleMapOfConstructs(code) { |
|
if (code in constructs || codes.eof in constructs) { |
|
return handleListOfConstructs( |
|
constructs.null |
|
? /* c8 ignore next */ |
|
miniflat(constructs[code]).concat(miniflat(constructs.null)) |
|
: constructs[code] |
|
)(code) |
|
} |
|
|
|
return bogusState(code) |
|
} |
|
|
|
function handleListOfConstructs(list) { |
|
listOfConstructs = list |
|
constructIndex = 0 |
|
return handleConstruct(list[constructIndex]) |
|
} |
|
|
|
function handleConstruct(construct) { |
|
return start |
|
|
|
function start(code) { |
|
// To do: not nede to store if there is no bogus state, probably? |
|
// Currently doesn’t work because `inspect` in document does a check |
|
// w/o a bogus, which doesn’t make sense. But it does seem to help perf |
|
// by not storing. |
|
info = store() |
|
currentConstruct = construct |
|
|
|
if (!construct.partial) { |
|
context.currentConstruct = construct |
|
} |
|
|
|
if ( |
|
construct.name && |
|
context.parser.constructs.disable.null.indexOf(construct.name) > -1 |
|
) { |
|
return nok(code) |
|
} |
|
|
|
return construct.tokenize.call( |
|
fields ? assign({}, context, fields) : context, |
|
effects, |
|
ok, |
|
nok |
|
)(code) |
|
} |
|
} |
|
|
|
function ok(code) { |
|
assert.equal(code, expectedCode, 'expected code') |
|
consumed = true |
|
onreturn(currentConstruct, info) |
|
return returnState |
|
} |
|
|
|
function nok(code) { |
|
assert.equal(code, expectedCode, 'expected code') |
|
consumed = true |
|
info.restore() |
|
|
|
if (++constructIndex < listOfConstructs.length) { |
|
return handleConstruct(listOfConstructs[constructIndex]) |
|
} |
|
|
|
return bogusState |
|
} |
|
} |
|
} |
|
|
|
// Add the result of a successful construct.
// Remembers the construct if it has a `resolveAll` handler (once), lets its
// `resolve` handler replace the events from index `from` onwards, and lets
// its `resolveTo` handler replace all events.
// Asserts that afterwards, unless the construct is partial, the events are
// either empty or end in an `exit`.
function addResult(construct, from) {
  const isKnown = resolveAllConstructs.indexOf(construct) >= 0

  if (construct.resolveAll && !isKnown) {
    resolveAllConstructs.push(construct)
  }

  if (construct.resolve) {
    const events = context.events
    const removeCount = events.length - from
    const resolved = construct.resolve(events.slice(from), context)

    chunkedSplice(events, from, removeCount, resolved)
  }

  if (construct.resolveTo) {
    context.events = construct.resolveTo(context.events, context)
  }

  assert(
    construct.partial ||
      !context.events.length ||
      context.events[context.events.length - 1][0] === 'exit',
    'expected last token to end'
  )
}
|
|
|
// Store the current state: position, previous code, current construct, the
// number of events, and a copy of the stack of open tokens.
// Returns an object with a `restore` function, which puts all of that back,
// and `from`, the number of events at the time of storing.
function store() {
  const saved = {
    point: now(),
    previous: context.previous,
    currentConstruct: context.currentConstruct,
    eventsIndex: context.events.length,
    stack: stack.slice()
  }

  return {restore: restore, from: saved.eventsIndex}

  function restore() {
    point = saved.point
    context.previous = saved.previous
    context.currentConstruct = saved.currentConstruct
    // Drop all events that were added since storing.
    context.events.length = saved.eventsIndex
    stack = saved.stack
    accountForPotentialSkip()
    debug('position: restore: `%j`', point)
  }
}
|
|
|
// If a start column was defined for the current line (see `skip`) and we are
// still on the first column, jump to that column and move the offset along
// by the same amount.
function accountForPotentialSkip() {
  const atLineStart = point.column < 2

  if (!atLineStart || !(point.line in columnStart)) {
    return
  }

  const startColumn = columnStart[point.line]

  point.column = startColumn
  point.offset += startColumn - 1
}
|
}
|
|
|