* @default true
*/
decodeEntities?: boolean
-
- /**
- * If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled.
- * NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text.
- *
- * @default false
- */
- recognizeCDATA?: boolean
-
- /**
- * If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`.
- * NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized.
- *
- * @default false
- */
- recognizeSelfClosing?: boolean
}
export interface Handler {
/** Determines whether self-closing tags are recognized. */
private readonly foreignContext: boolean[]
private readonly cbs: Partial<Handler>
- private readonly recognizeSelfClosing: boolean
private readonly tokenizer: Tokenizer
private buffer: string = ''
private readonly options: ParserOptions = {}
) {
this.cbs = cbs ?? {}
- this.recognizeSelfClosing = options.recognizeSelfClosing ?? false
this.tokenizer = new Tokenizer(this.options, this)
this.foreignContext = [false]
this.cbs.onparserinit?.(this)
/** @internal */
onselfclosingtag(endIndex: number): void {
this.endIndex = endIndex
- if (this.recognizeSelfClosing || this.foreignContext[0]) {
- this.closeCurrentTag(false)
-
- // Set `startIndex` for next node
- this.startIndex = endIndex + 1
- } else {
- // Ignore the fact that the tag is self-closing.
- this.onopentagend(endIndex)
- }
+ // Self-closing syntax is always honored now: the tag is closed right away,
+ // without consulting `recognizeSelfClosing` or the foreign-context flag.
+ this.closeCurrentTag(false)
+ // Set `startIndex` for next node
+ this.startIndex = endIndex + 1
}
private closeCurrentTag(isOpenImplied: boolean) {
/** @internal */
oncdata(start: number, endIndex: number, offset: number): void {
this.endIndex = endIndex
- const value = this.getSlice(start, endIndex - offset)
-
- if (this.options.recognizeCDATA) {
- this.cbs.oncdatastart?.()
- this.cbs.ontext?.(value)
- this.cbs.oncdataend?.()
- } else {
- this.cbs.oncomment?.(`[CDATA[${value}]]`)
- this.cbs.oncommentend?.()
- }
-
+ // CDATA content is always reported as text between `oncdatastart` and
+ // `oncdataend`; the comment fallback was removed with `recognizeCDATA`.
+ this.cbs.oncdatastart?.()
+ this.cbs.ontext?.(this.getSlice(start, endIndex - offset))
+ this.cbs.oncdataend?.()
// Set `startIndex` for next node
this.startIndex = endIndex + 1
}
public parse(input: string): void {
this.reset()
this.buffer = input
- this.tokenizer.write(input)
- this.tokenizer.end()
+ // The tokenizer takes the whole input in a single call, replacing the
+ // former `write` + `end` pair.
+ this.tokenizer.parse(input)
}
/**
+/**
+ * This Tokenizer is adapted from htmlparser2 under the MIT License listed at
+ * https://github.com/fb55/htmlparser2/blob/master/LICENSE
+
+Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
+ */
+
import {
EntityDecoder,
DecodingMode,
private baseState = State.Text
/** For special parsing behavior inside of script and style tags. */
private isSpecial = false
- /** Indicates whether the tokenizer has been paused. */
- public running = true
- /** The offset of the current buffer. */
- private offset = 0
private readonly decodeEntities: boolean
private readonly entityDecoder: EntityDecoder
this.index = 0
this.baseState = State.Text
this.currentSequence = undefined!
- this.running = true
- this.offset = 0
- }
-
- public write(chunk: string): void {
- this.offset += this.buffer.length
- this.buffer = chunk
- this.parse()
- }
-
- public end(): void {
- if (this.running) this.finish()
- }
-
- public pause(): void {
- this.running = false
- }
-
- public resume(): void {
- this.running = true
- if (this.index < this.buffer.length + this.offset) {
- this.parse()
- }
}
private stateText(c: number): void {
* @returns Whether the character was found.
*/
private fastForwardTo(c: number): boolean {
- while (++this.index < this.buffer.length + this.offset) {
- if (this.buffer.charCodeAt(this.index - this.offset) === c) {
+ while (++this.index < this.buffer.length) {
+ if (this.buffer.charCodeAt(this.index) === c) {
return true
}
}
*
* TODO: Refactor `parse` to increment index before calling states.
*/
- this.index = this.buffer.length + this.offset - 1
+ this.index = this.buffer.length - 1
return false
}
}
private stateInEntity(): void {
- const length = this.entityDecoder.write(
- this.buffer,
- this.index - this.offset
- )
+ const length = this.entityDecoder.write(this.buffer, this.index)
// If `length` is positive, we are done with the entity.
if (length >= 0) {
}
} else {
// Mark buffer as consumed.
- this.index = this.offset + this.buffer.length - 1
+ this.index = this.buffer.length - 1
}
}
- /**
- * Remove data that has already been consumed from the buffer.
- */
- private cleanup() {
- // If we are inside of text or attributes, emit what we already have.
- if (this.running && this.sectionStart !== this.index) {
- if (
- this.state === State.Text ||
- (this.state === State.InSpecialTag && this.sequenceIndex === 0)
- ) {
- this.cbs.ontext(this.sectionStart, this.index)
- this.sectionStart = this.index
- } else if (
- this.state === State.InAttributeValueDq ||
- this.state === State.InAttributeValueSq ||
- this.state === State.InAttributeValueNq
- ) {
- this.cbs.onattribdata(this.sectionStart, this.index)
- this.sectionStart = this.index
- }
- }
- }
-
- private shouldContinue() {
- return this.index < this.buffer.length + this.offset && this.running
- }
-
/**
* Iterates through the buffer, calling the function corresponding to the current state.
*
* States that are more likely to be hit are higher up, as a performance improvement.
*/
- private parse() {
- while (this.shouldContinue()) {
- const c = this.buffer.charCodeAt(this.index - this.offset)
+ public parse(input: string) {
+ this.buffer = input
+ while (this.index < this.buffer.length) {
+ const c = this.buffer.charCodeAt(this.index)
switch (this.state) {
case State.Text: {
this.stateText(c)
this.index++
}
this.cleanup()
+ this.finish()
+ }
+
+ /**
+ * Flushes a section that is still in progress at the current index.
+ *
+ * When `sectionStart` differs from `index`:
+ * - in `Text` state, or in `InSpecialTag` state with `sequenceIndex === 0`,
+ * the pending range is reported through `ontext`;
+ * - in any of the attribute-value states it is reported through
+ * `onattribdata`.
+ * In both cases `sectionStart` is then moved up to `index`. Any other state
+ * is left untouched.
+ */
+ private cleanup() {
+ // If we are inside of text or attributes, emit what we already have.
+ if (this.sectionStart !== this.index) {
+ if (
+ this.state === State.Text ||
+ (this.state === State.InSpecialTag && this.sequenceIndex === 0)
+ ) {
+ this.cbs.ontext(this.sectionStart, this.index)
+ this.sectionStart = this.index
+ } else if (
+ this.state === State.InAttributeValueDq ||
+ this.state === State.InAttributeValueSq ||
+ this.state === State.InAttributeValueNq
+ ) {
+ this.cbs.onattribdata(this.sectionStart, this.index)
+ this.sectionStart = this.index
+ }
+ }
}
private finish() {
/** Handle any trailing data. */
private handleTrailingData() {
- const endIndex = this.buffer.length + this.offset
+ const endIndex = this.buffer.length
// If there is no remaining data, we are done.
if (this.sectionStart >= endIndex) {
-import { RootNode, createRoot } from '../ast'
+import { fromCodePoint } from 'entities/lib/decode.js'
+import {
+ ElementNode,
+ ElementTypes,
+ NodeTypes,
+ RootNode,
+ TemplateChildNode,
+ createRoot
+} from '../ast'
import { ParserOptions } from '../options'
-import { Parser } from './Parser'
+import Tokenizer from './Tokenizer'
+import { hasOwn } from '@vue/shared'
-const parser = new Parser({
- // TODO
-})
+// Tag-name sets shared by several entries of `openImpliesClose`.
+const formTags = new Set([
+ 'input',
+ 'option',
+ 'optgroup',
+ 'select',
+ 'button',
+ 'datalist',
+ 'textarea'
+])
+const pTag = new Set(['p'])
+const tableSectionTags = new Set(['thead', 'tbody'])
+const ddtTags = new Set(['dd', 'dt'])
+const rtpTags = new Set(['rt', 'rp'])
+
+// Maps a tag that is being opened to the set of tags it implicitly closes.
+// `emitOpenTag` consults this table only in `htmlMode`: while the innermost
+// open tag is in the set, that tag is closed before the new one is pushed.
+const openImpliesClose = new Map<string, Set<string>>([
+ ['tr', new Set(['tr', 'th', 'td'])],
+ ['th', new Set(['th'])],
+ ['td', new Set(['thead', 'th', 'td'])],
+ ['body', new Set(['head', 'link', 'script'])],
+ ['li', new Set(['li'])],
+ ['p', pTag],
+ ['h1', pTag],
+ ['h2', pTag],
+ ['h3', pTag],
+ ['h4', pTag],
+ ['h5', pTag],
+ ['h6', pTag],
+ ['select', formTags],
+ ['input', formTags],
+ ['output', formTags],
+ ['button', formTags],
+ ['datalist', formTags],
+ ['textarea', formTags],
+ ['option', new Set(['option'])],
+ ['optgroup', new Set(['optgroup', 'option'])],
+ ['dd', ddtTags],
+ ['dt', ddtTags],
+ ['address', pTag],
+ ['article', pTag],
+ ['aside', pTag],
+ ['blockquote', pTag],
+ ['details', pTag],
+ ['div', pTag],
+ ['dl', pTag],
+ ['fieldset', pTag],
+ ['figcaption', pTag],
+ ['figure', pTag],
+ ['footer', pTag],
+ ['form', pTag],
+ ['header', pTag],
+ ['hr', pTag],
+ ['main', pTag],
+ ['nav', pTag],
+ ['ol', pTag],
+ ['pre', pTag],
+ ['section', pTag],
+ ['table', pTag],
+ ['ul', pTag],
+ ['rt', rtpTags],
+ ['rp', rtpTags],
+ ['tbody', tableSectionTags],
+ ['tfoot', tableSectionTags]
+])
+
+// Tags that never have children. They are not pushed onto `stack` by
+// `emitOpenTag`, are closed immediately by `endOpenTag`, and an explicit
+// closing tag for one of them does not close anything in `onclosetag`.
+const voidElements = new Set([
+ 'area',
+ 'base',
+ 'basefont',
+ 'br',
+ 'col',
+ 'command',
+ 'embed',
+ 'frame',
+ 'hr',
+ 'img',
+ 'input',
+ 'isindex',
+ 'keygen',
+ 'link',
+ 'meta',
+ 'param',
+ 'source',
+ 'track',
+ 'wbr'
+])
+
+// In `htmlMode`, opening one of these tags pushes `true` onto `foreignContext`.
+const foreignContextElements = new Set(['math', 'svg'])
+
+// In `htmlMode`, opening one of these tags pushes `false` onto
+// `foreignContext`. Closing a tag from either set pops one entry.
+const htmlIntegrationElements = new Set([
+ 'mi',
+ 'mo',
+ 'mn',
+ 'ms',
+ 'mtext',
+ 'annotation-xml',
+ 'foreignobject',
+ 'desc',
+ 'title'
+])
+
+// Options and AST root of the parse in progress; both are replaced by `baseParse`.
+let currentOptions: ParserOptions = {}
+let currentRoot: RootNode = createRoot([])
+// Elements that were opened and not yet closed; the last entry is the current parent.
+let elementStack: ElementNode[] = []
+
+// parser state
+// Enables implied closes and foreign-context tracking; set from `options.htmlMode`.
+let htmlMode = false
+// The string handed to the tokenizer; `getSlice` reads from it.
+let currentInput = ''
+// `startIndex` as it was when the pending open tag began (see `emitOpenTag` / `endOpenTag`).
+let openTagStart = 0
+// Name of the open tag being assembled; cleared by `endOpenTag`.
+let tagname = ''
+// Name and accumulated value of the attribute currently being read.
+let attribname = ''
+let attribvalue = ''
+// Attributes of the pending open tag; non-null only between `emitOpenTag` and `endOpenTag`.
+let attribs: Record<string, string> | null = null
+// Offsets maintained by the tokenizer callbacks for the node being processed.
+let startIndex = 0
+let endIndex = 0
+// Read by `onCloseTag` to skip whitespace condensing; never assigned in the code visible here.
+let inPre = 0
+// let inVPre = 0
+// Names of the open tags, innermost first (index 0 is the current tag).
+const stack: string[] = []
+// Foreign-content flags, innermost first; maintained by `emitOpenTag` and `onclosetag`.
+const foreignContext: boolean[] = [false]
+
+// Shared tokenizer instance. Its callbacks receive offsets into
+// `currentInput` and build the AST under `currentRoot` through the helper
+// functions defined further down in this module.
+const tokenizer = new Tokenizer(
+ // TODO handle entities
+ { decodeEntities: true },
+ {
+ // A run of plain text: append it to (or merge it into) the current parent.
+ ontext(start, end) {
+ const content = getSlice(start, end)
+ endIndex = end - 1
+ onText(content)
+ startIndex = end
+ },
+
+ // A decoded entity arrives as a code point and is treated as text.
+ ontextentity(cp, end) {
+ endIndex = end - 1
+ onText(fromCodePoint(cp))
+ startIndex = end
+ },
+
+ // The tag name has been read: start a pending open tag.
+ onopentagname(start, end) {
+ emitOpenTag(getSlice(start, (endIndex = end)))
+ },
+
+ // The open tag is complete: create the element node.
+ onopentagend(end) {
+ endIndex = end
+ endOpenTag(false)
+ startIndex = end + 1
+ },
+
+ onclosetag(start, end) {
+ endIndex = end
+ const name = getSlice(start, end)
+
+ // Leaving a foreign-context or integration element pops one flag.
+ if (
+ htmlMode &&
+ (foreignContextElements.has(name) || htmlIntegrationElements.has(name))
+ ) {
+ foreignContext.shift()
+ }
+
+ if (!voidElements.has(name)) {
+ const pos = stack.indexOf(name)
+ if (pos !== -1) {
+ // Close the matching tag and every tag nested inside it.
+ for (let index = 0; index <= pos; index++) {
+ stack.shift()
+ onCloseTag()
+ }
+ } else if (htmlMode && name === 'p') {
+ // Implicit open before close
+ emitOpenTag('p')
+ closeCurrentTag(true)
+ }
+ } else if (htmlMode && name === 'br') {
+ // TODO
+ // We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
+ // this.cbs.onopentag?.('br', {}, true)
+ // this.cbs.onclosetag?.('br', false)
+ }
+
+ // Set `startIndex` for next node
+ startIndex = end + 1
+ },
+
+ // `<tag />`: finish the open tag and close it in one step.
+ onselfclosingtag(end) {
+ endIndex = end
+ closeCurrentTag(false)
+ startIndex = end + 1
+ },
+
+ onattribname(start, end) {
+ attribname = getSlice((startIndex = start), end)
+ },
+ // Attribute values may arrive in several pieces; concatenate them.
+ onattribdata(start, end) {
+ attribvalue += getSlice(start, end)
+ },
+ onattribentity(codepoint) {
+ attribvalue += fromCodePoint(codepoint)
+ },
+ // Store the finished attribute; the first occurrence of a name wins.
+ // The collected `attribs` are not yet attached to the element node.
+ onattribend(quote, end) {
+ endIndex = end
+ if (attribs && !hasOwn(attribs, attribname)) {
+ // TODO gen attributes AST nodes
+ attribs[attribname] = attribvalue
+ }
+ attribvalue = ''
+ },
+
+ // Comments currently only advance the offsets; no node is produced.
+ oncomment(start, end, offset) {
+ endIndex = end
+ // TODO oncomment
+ startIndex = end + 1
+ },
+
+ // End of input: run the close logic once per tag name still on `stack`.
+ onend() {
+ // Set the end index for all remaining tags
+ endIndex = startIndex
+ for (let index = 0; index < stack.length; index++) {
+ onCloseTag()
+ }
+ },
+
+ // CDATA currently only advances the offsets; no node is produced.
+ oncdata(start, end, offset) {
+ endIndex = end
+ // TODO throw error
+ startIndex = end + 1
+ },
+
+ // TODO ignore
+ ondeclaration(start, end) {
+ endIndex = end
+ // TODO ondeclaration
+ startIndex = end + 1
+ },
+
+ // TODO ignore
+ onprocessinginstruction(start, end) {
+ endIndex = end
+ // TODO onprocessinginstruction
+ startIndex = end + 1
+ }
+ }
+)
+
+/**
+ * Reads the part of the current input between two tokenizer offsets.
+ *
+ * @param start - offset of the first character to include
+ * @param end - offset just past the last character to include
+ */
+function getSlice(start: number, end: number) {
+  const text = currentInput.slice(start, end)
+  return text
+}
+
+/**
+ * Begins a new open tag called `name`.
+ *
+ * Records the tag as pending, applies the HTML "open implies close" rules,
+ * registers the tag on `stack` (unless it is a void element) and starts a
+ * fresh attribute map for it.
+ */
+function emitOpenTag(name: string) {
+  tagname = name
+  openTagStart = startIndex
+
+  // In HTML mode some tags close whatever is currently open before them.
+  const closedByThisTag = htmlMode ? openImpliesClose.get(name) : undefined
+  if (closedByThisTag) {
+    while (stack.length > 0 && closedByThisTag.has(stack[0])) {
+      stack.shift()
+      onCloseTag()
+    }
+  }
+
+  const isVoid = voidElements.has(name)
+  if (!isVoid) {
+    stack.unshift(name)
+    if (htmlMode) {
+      // `true` when entering math/svg, `false` when entering an integration element.
+      const entersForeign = foreignContextElements.has(name)
+      if (entersForeign || htmlIntegrationElements.has(name)) {
+        foreignContext.unshift(entersForeign)
+      }
+    }
+  }
+
+  attribs = {}
+}
+
+/**
+ * Finishes the pending open tag and, if it is the innermost entry on
+ * `stack`, closes it straight away.
+ *
+ * @param isOpenImplied - forwarded to `endOpenTag`
+ */
+function closeCurrentTag(isOpenImplied: boolean) {
+  // Capture the name first: `endOpenTag` clears `tagname`.
+  const pending = tagname
+  endOpenTag(isOpenImplied)
+
+  if (stack[0] !== pending) {
+    return
+  }
+  onCloseTag()
+  stack.shift()
+}
+
+/**
+ * Completes the pending open tag: restores `startIndex` to where the tag
+ * began, creates the element node if one is still pending, closes it again
+ * immediately when it is a void element, and clears the pending tag name.
+ *
+ * @param isImplied - currently unused
+ */
+function endOpenTag(isImplied: boolean) {
+  const pending = tagname
+  startIndex = openTagStart
+
+  // `attribs` is non-null only while an open tag is waiting to be emitted.
+  if (attribs !== null) {
+    onOpenTag(pending)
+    attribs = null
+  }
+
+  if (voidElements.has(pending)) {
+    onCloseTag()
+  }
+
+  tagname = ''
+}
+
+/**
+ * Adds text to the current parent. When the parent's last child is already
+ * a text node the new content is appended to it; otherwise a new text node
+ * is created.
+ */
+function onText(content: string) {
+  const siblings = getParent().children
+  const tail = siblings[siblings.length - 1]
+
+  if (tail && tail.type === NodeTypes.TEXT) {
+    // merge with the preceding text node
+    tail.content += content
+    // TODO update loc
+    return
+  }
+
+  siblings.push({
+    type: NodeTypes.TEXT,
+    content,
+    // @ts-ignore TODO
+    loc: {}
+  })
+}
+
+/**
+ * Creates an element node for `tag`, appends it to the current parent and
+ * then makes it the current parent by pushing it onto `elementStack`.
+ */
+function onOpenTag(tag: string) {
+  const node: ElementNode = {
+    type: NodeTypes.ELEMENT,
+    tag,
+    // TODO namespace
+    ns: 0,
+    // TODO refine tag type
+    tagType: ElementTypes.ELEMENT,
+    // TODO props
+    props: [],
+    children: [],
+    // @ts-ignore TODO
+    loc: {},
+    codegenNode: undefined
+  }
+  // Attach before pushing: once on the stack, `node` is its own parent lookup.
+  getParent().children.push(node)
+  elementStack.push(node)
+}
+
+/**
+ * Pops the innermost open element and normalizes the whitespace of its
+ * direct text children.
+ *
+ * Outside of `<pre>` handling (`inPre` falsy):
+ * - whitespace-only text nodes are removed when they are the first or last
+ *   child, or (unless `whitespace: 'preserve'`) when they sit between
+ *   comments/elements as described below; otherwise they collapse to `' '`;
+ * - other text nodes have runs of whitespace collapsed to a single space
+ *   unless `whitespace: 'preserve'` is set.
+ * When `inPre` is truthy only `\r\n` is normalized to `\n`.
+ *
+ * Does nothing when there is no open element. That happens when `stack`
+ * holds a tag name whose open tag never completed (for example the input
+ * ends inside the tag), because `stack` is filled when the tag name is read
+ * while `elementStack` is only filled once the open tag ends.
+ */
+function onCloseTag() {
+  const el = elementStack.pop()
+  if (!el) {
+    // Nothing to close; see the note above.
+    return
+  }
+  // whitespace management
+  const nodes = el.children
+  const shouldCondense = currentOptions.whitespace !== 'preserve'
+  let removedWhitespace = false
+  for (let i = 0; i < nodes.length; i++) {
+    const node = nodes[i]
+    if (node.type === NodeTypes.TEXT) {
+      if (!inPre) {
+        if (!/[^\t\r\n\f ]/.test(node.content)) {
+          const prev = nodes[i - 1]
+          const next = nodes[i + 1]
+          // Remove if:
+          // - the whitespace is the first or last node, or:
+          // - (condense mode) the whitespace is between two comments, or:
+          // - (condense mode) the whitespace is between comment and element, or:
+          // - (condense mode) the whitespace is between two elements AND contains newline
+          if (
+            !prev ||
+            !next ||
+            (shouldCondense &&
+              ((prev.type === NodeTypes.COMMENT &&
+                next.type === NodeTypes.COMMENT) ||
+                (prev.type === NodeTypes.COMMENT &&
+                  next.type === NodeTypes.ELEMENT) ||
+                (prev.type === NodeTypes.ELEMENT &&
+                  next.type === NodeTypes.COMMENT) ||
+                (prev.type === NodeTypes.ELEMENT &&
+                  next.type === NodeTypes.ELEMENT &&
+                  /[\r\n]/.test(node.content))))
+          ) {
+            // Mark for removal; the holes are filtered out after the loop.
+            removedWhitespace = true
+            nodes[i] = null as any
+          } else {
+            // Otherwise, the whitespace is condensed into a single space
+            node.content = ' '
+          }
+        } else if (shouldCondense) {
+          // in condense mode, consecutive whitespaces in text are condensed
+          // down to a single space.
+          node.content = node.content.replace(/[\t\r\n\f ]+/g, ' ')
+        }
+      } else {
+        // #6410 normalize windows newlines in <pre>:
+        // in SSR, browsers normalize server-rendered \r\n into a single \n
+        // in the DOM
+        node.content = node.content.replace(/\r\n/g, '\n')
+      }
+    }
+  }
+  if (removedWhitespace) {
+    el.children = nodes.filter(Boolean)
+  }
+}
+
+/** Appends `node` to the children of the current parent. */
+function addNode(node: TemplateChildNode) {
+  const parent = getParent()
+  parent.children.push(node)
+}
+
+/**
+ * Returns the node new children should be attached to: the innermost open
+ * element, or the root when no element is open.
+ */
+function getParent() {
+  const innermost = elementStack[elementStack.length - 1]
+  return innermost ? innermost : currentRoot
+}
+
+/**
+ * Resets the tokenizer and the per-parse bookkeeping listed below so that a
+ * new input can be parsed.
+ */
+function reset() {
+  tokenizer.reset()
+
+  // pending tag and attribute
+  tagname = attribname = attribvalue = ''
+  attribs = null
+
+  // offsets
+  startIndex = endIndex = 0
+
+  // stacks; `foreignContext` goes back to its single `false` entry
+  elementStack.length = 0
+  stack.length = 0
+  foreignContext.length = 1
+  foreignContext[0] = false
+}
+/**
+ * Parses a template string into a root AST node.
+ *
+ * All parser state lives in module-level variables that this function
+ * resets and overwrites before running the shared tokenizer.
+ * NOTE(review): the input is trimmed before tokenizing, so tokenizer offsets
+ * are relative to the trimmed string — confirm this is intended.
+ *
+ * @param input - template source
+ * @param options - parser options; this file reads `htmlMode` and `whitespace`
+ * @returns the root node that received the parsed children
+ */
export function baseParse(
- content: string,
+ input: string,
options: ParserOptions = {}
): RootNode {
- const root = createRoot([])
- parser.parse(content)
+ reset()
+ currentInput = input.trim()
+ currentOptions = options
+ htmlMode = !!options.htmlMode
+ const root = (currentRoot = createRoot([]))
+ tokenizer.parse(currentInput)
+ // temp hack for ts
+ // NOTE(review): presumably keeps `endIndex` from being reported as unused;
+ // it writes to the console on every parse and should go before release.
+ console.log(endIndex)
return root
}