refactor: html parsing

This commit is contained in:
Anthony Fu 2023-01-07 10:31:48 +01:00
parent d76e4bfaa5
commit 6944a74653
8 changed files with 152 additions and 124 deletions

View file

@ -4,27 +4,23 @@ defineOptions({
name: 'ContentRich', name: 'ContentRich',
}) })
const { content, emojis, markdown = true } = defineProps<{ const {
content,
emojis,
markdown = true,
} = defineProps<{
content: string content: string
markdown?: boolean
emojis?: Emoji[] emojis?: Emoji[]
markdown?: boolean
}>() }>()
const useEmojis = computed(() => { const emojisObject = useEmojisFallback(() => emojis)
const result: Emoji[] = []
if (emojis)
result.push(...emojis)
result.push(...currentCustomEmojis.value.emojis)
return emojisArrayToObject(result)
})
export default () => h( export default () => h(
'span', 'span',
{ class: 'content-rich', dir: 'auto' }, { class: 'content-rich', dir: 'auto' },
contentToVNode(content, { contentToVNode(content, {
emojis: useEmojis.value, emojis: emojisObject.value,
markdown, markdown,
}), }),
) )

View file

@ -1,23 +1,37 @@
<script setup lang="ts"> <script setup lang="ts">
import type { Status, StatusEdit } from 'masto' import type { Status, StatusEdit } from 'masto'
const { status, withAction = true } = defineProps<{ const {
status,
withAction = true,
} = defineProps<{
status: Status | StatusEdit status: Status | StatusEdit
withAction?: boolean withAction?: boolean
}>() }>()
const { translation } = useTranslation(status) const { translation } = useTranslation(status)
const emojisObject = useEmojisFallback(() => status.emojis)
// Rendered content of the status, or null when the status has no content.
// Re-evaluated reactively through `$computed`.
const vnode = $computed(() => {
  if (!status.content)
    return null

  return contentToVNode(status.content, {
    emojis: emojisObject.value,
    markdown: true,
  })
})
</script> </script>
<template> <template>
<div class="status-body" whitespace-pre-wrap break-words :class="{ 'with-action': withAction }"> <div class="status-body" whitespace-pre-wrap break-words :class="{ 'with-action': withAction }">
<ContentRich <span
v-if="status.content" v-if="status.content"
class="line-compact" class="content-rich line-compact" dir="auto"
:content="status.content" :lang="('language' in status && status.language) || undefined"
:emojis="status.emojis" >
:lang="'language' in status && status.language" <component :is="vnode" />
/> </span>
<div v-else /> <div v-else />
<template v-if="translation.visible"> <template v-if="translation.visible">
<div my2 h-px border="b base" bg-base /> <div my2 h-px border="b base" bg-base />

View file

@ -5,6 +5,34 @@ import { ELEMENT_NODE, TEXT_NODE, h, parse, render } from 'ultrahtml'
import { findAndReplaceEmojisInText } from '@iconify/utils' import { findAndReplaceEmojisInText } from '@iconify/utils'
import { emojiRegEx, getEmojiAttributes } from '../config/emojis' import { emojiRegEx, getEmojiAttributes } from '../config/emojis'
/**
 * Options accepted by `parseMastodonHTML` (and forwarded unchanged by
 * `contentToVNode`). Every field is optional.
 */
export interface ContentParseOptions {
// Custom emoji lookup handed to `replaceCustomEmoji`; an empty object is used when omitted.
emojis?: Record<string, Emoji>
// Defaults to `true`. When enabled, fenced code blocks are rewritten to
// `<pre><code>` before parsing and `transformMarkdown` is added to the chain.
markdown?: boolean
// Defaults to `true`. When enabled, `transformUnicodeEmoji` is added to the
// chain; `convertMastodonHTML` turns it off when producing HTML for Tiptap.
replaceUnicodeEmoji?: boolean
// Extra transforms, placed right after the sanitizer and before the built-in
// emoji / markdown transforms.
astTransforms?: Transform[]
}
// Class-attribute filter shared by the `a` and `span` rules of the sanitizer.
// NOTE(review): presumably `filterClasses` keeps only the class names matching
// the pattern (h-*, p-*, u-*, dt-*, e-*, mention, hashtag, ellipsis, invisible)
// — confirm against its definition.
const sanitizerBasicClasses = filterClasses(/^(h-\S*|p-\S*|u-\S*|dt-\S*|e-\S*|mention|hashtag|ellipsis|invisible)$/u)
// Sanitizing transform, built once at module level and used as the first
// transform in `parseMastodonHTML`. Each key is an allowed element; its value
// maps attribute names to the sanitizer applied to that attribute.
const sanitizer = sanitize({
// Allow basic elements as seen in https://github.com/mastodon/mastodon/blob/17f79082b098e05b68d6f0d38fabb3ac121879a9/lib/sanitize_ext/sanitize_config.rb
br: {},
p: {},
a: {
href: filterHref(),
class: sanitizerBasicClasses,
// NOTE(review): `set(...)` presumably forces the attribute to the given value — confirm.
rel: set('nofollow noopener noreferrer'),
target: set('_blank'),
},
span: {
class: sanitizerBasicClasses,
},
// Allow elements potentially created for Markdown code blocks by `parseMastodonHTML`
pre: {},
code: {
class: filterClasses(/^language-\w+$/),
},
})
const decoder = process.client ? document.createElement('textarea') : null const decoder = process.client ? document.createElement('textarea') : null
export function decodeHtml(text: string) { export function decodeHtml(text: string) {
if (!decoder) if (!decoder)
@ -18,11 +46,19 @@ export function decodeHtml(text: string) {
* Parse raw HTML from Mastodon server to AST, * Parse raw HTML from Mastodon server to AST,
* with interop of custom emojis and inline Markdown syntax * with interop of custom emojis and inline Markdown syntax
*/ */
export function parseMastodonHTML(html: string, customEmojis: Record<string, Emoji> = {}, markdown = true, forTiptap = false) { export function parseMastodonHTML(
html: string,
options: ContentParseOptions = {},
) {
const {
markdown = true,
replaceUnicodeEmoji = true,
} = options
if (markdown) { if (markdown) {
// Handle code blocks // Handle code blocks
html = html html = html
.replace(/>(```|~~~)(\w*)([\s\S]+?)\1/g, (_1, _2, lang, raw) => { .replace(/>(```|~~~)(\w*)([\s\S]+?)\1/g, (_1, _2, lang: string, raw: string) => {
const code = htmlToText(raw) const code = htmlToText(raw)
const classes = lang ? ` class="language-${lang}"` : '' const classes = lang ? ` class="language-${lang}"` : ''
return `><pre><code${classes}>${code}</code></pre>` return `><pre><code${classes}>${code}</code></pre>`
@ -30,39 +66,31 @@ export function parseMastodonHTML(html: string, customEmojis: Record<string, Emo
} }
// Always sanitize the raw HTML data *after* it has been modified // Always sanitize the raw HTML data *after* it has been modified
const basicClasses = filterClasses(/^(h-\S*|p-\S*|u-\S*|dt-\S*|e-\S*|mention|hashtag|ellipsis|invisible)$/u) const transforms: Transform[] = [
return transformSync(parse(html), [ sanitizer,
sanitize({ ...options.astTransforms || [],
// Allow basic elements as seen in https://github.com/mastodon/mastodon/blob/17f79082b098e05b68d6f0d38fabb3ac121879a9/lib/sanitize_ext/sanitize_config.rb ]
br: {},
p: {}, if (replaceUnicodeEmoji)
a: { transforms.push(transformUnicodeEmoji)
href: filterHref(),
class: basicClasses, if (markdown)
rel: set('nofollow noopener noreferrer'), transforms.push(transformMarkdown)
target: set('_blank'),
}, transforms.push(replaceCustomEmoji(options.emojis || {}))
span: {
class: basicClasses, return transformSync(parse(html), transforms)
},
// Allow elements potentially created for Markdown code blocks above
pre: {},
code: {
class: filterClasses(/^language-\w+$/),
},
}),
// Unicode emojis to images, but only if not converting HTML for Tiptap
!forTiptap ? replaceUnicodeEmoji() : noopTransform(),
markdown ? formatMarkdown() : noopTransform(),
replaceCustomEmoji(customEmojis),
])
} }
/** /**
* Converts raw HTML from Mastodon server to HTML for Tiptap editor * Converts raw HTML from Mastodon server to HTML for Tiptap editor
*/ */
export function convertMastodonHTML(html: string, customEmojis: Record<string, Emoji> = {}) { export function convertMastodonHTML(html: string, customEmojis: Record<string, Emoji> = {}) {
const tree = parseMastodonHTML(html, customEmojis, true, true) const tree = parseMastodonHTML(html, {
emojis: customEmojis,
markdown: true,
replaceUnicodeEmoji: false,
})
return render(tree) return render(tree)
} }
@ -162,11 +190,6 @@ function transformSync(doc: Node, transforms: Transform[]) {
return doc return doc
} }
// A transformation that does nothing. Useful for conditional transform chains.
function noopTransform(): Transform {
return node => node
}
// A tree transform for sanitizing elements & their attributes. // A tree transform for sanitizing elements & their attributes.
type AttrSanitizers = Record<string, (value: string | undefined) => string | undefined> type AttrSanitizers = Record<string, (value: string | undefined) => string | undefined>
function sanitize(allowedElements: Record<string, AttrSanitizers>): Transform { function sanitize(allowedElements: Record<string, AttrSanitizers>): Transform {
@ -241,27 +264,25 @@ function filterHref() {
} }
} }
function replaceUnicodeEmoji(): Transform { function transformUnicodeEmoji(node: Node) {
return (node) => { if (node.type !== TEXT_NODE)
if (node.type !== TEXT_NODE) return node
return node
let start = 0 let start = 0
const matches = [] as (string | Node)[] const matches = [] as (string | Node)[]
findAndReplaceEmojisInText(emojiRegEx, node.value, (match, result) => { findAndReplaceEmojisInText(emojiRegEx, node.value, (match, result) => {
const attrs = getEmojiAttributes(match) const attrs = getEmojiAttributes(match)
matches.push(result.slice(start)) matches.push(result.slice(start))
matches.push(h('img', { src: attrs.src, alt: attrs.alt, class: attrs.class })) matches.push(h('img', { src: attrs.src, alt: attrs.alt, class: attrs.class }))
start = result.length + match.match.length start = result.length + match.match.length
return undefined return undefined
}) })
if (matches.length === 0) if (matches.length === 0)
return node return node
matches.push(node.value.slice(start)) matches.push(node.value.slice(start))
return matches.filter(Boolean) return matches.filter(Boolean)
}
} }
function replaceCustomEmoji(customEmojis: Record<string, Emoji>): Transform { function replaceCustomEmoji(customEmojis: Record<string, Emoji>): Transform {
@ -286,47 +307,45 @@ function replaceCustomEmoji(customEmojis: Record<string, Emoji>): Transform {
} }
} }
function formatMarkdown(): Transform { const _markdownReplacements: [RegExp, (c: (string | Node)[]) => Node][] = [
const replacements: [RegExp, (c: (string | Node)[]) => Node][] = [ [/\*\*\*(.*?)\*\*\*/g, c => h('b', null, [h('em', null, c)])],
[/\*\*\*(.*?)\*\*\*/g, c => h('b', null, [h('em', null, c)])], [/\*\*(.*?)\*\*/g, c => h('b', null, c)],
[/\*\*(.*?)\*\*/g, c => h('b', null, c)], [/\*(.*?)\*/g, c => h('em', null, c)],
[/\*(.*?)\*/g, c => h('em', null, c)], [/~~(.*?)~~/g, c => h('del', null, c)],
[/~~(.*?)~~/g, c => h('del', null, c)], [/`([^`]+?)`/g, c => h('code', null, c)],
[/`([^`]+?)`/g, c => h('code', null, c)], ]
]
function process(value: string) { function _markdownProcess(value: string) {
const results = [] as (string | Node)[] const results = [] as (string | Node)[]
let start = 0 let start = 0
while (true) { while (true) {
let found: { match: RegExpMatchArray; replacer: (c: (string | Node)[]) => Node } | undefined let found: { match: RegExpMatchArray; replacer: (c: (string | Node)[]) => Node } | undefined
for (const [re, replacer] of replacements) { for (const [re, replacer] of _markdownReplacements) {
re.lastIndex = start re.lastIndex = start
const match = re.exec(value) const match = re.exec(value)
if (match) { if (match) {
if (!found || match.index < found.match.index!) if (!found || match.index < found.match.index!)
found = { match, replacer } found = { match, replacer }
}
} }
if (!found)
break
results.push(value.slice(start, found.match.index))
results.push(found.replacer(process(found.match[1])))
start = found.match.index! + found.match[0].length
} }
results.push(value.slice(start)) if (!found)
return results.filter(Boolean) break
results.push(value.slice(start, found.match.index))
results.push(found.replacer(_markdownProcess(found.match[1])))
start = found.match.index! + found.match[0].length
} }
return (node) => { results.push(value.slice(start))
if (node.type !== TEXT_NODE) return results.filter(Boolean)
return node }
return process(node.value)
} function transformMarkdown(node: Node) {
if (node.type !== TEXT_NODE)
return node
return _markdownProcess(node.value)
} }

View file

@ -1,9 +1,9 @@
import type { Emoji } from 'masto'
import { TEXT_NODE } from 'ultrahtml' import { TEXT_NODE } from 'ultrahtml'
import type { Node } from 'ultrahtml' import type { Node } from 'ultrahtml'
import { Fragment, h, isVNode } from 'vue' import { Fragment, h, isVNode } from 'vue'
import type { VNode } from 'vue' import type { VNode } from 'vue'
import { RouterLink } from 'vue-router' import { RouterLink } from 'vue-router'
import type { ContentParseOptions } from './content-parse'
import { decodeHtml, parseMastodonHTML } from './content-parse' import { decodeHtml, parseMastodonHTML } from './content-parse'
import ContentCode from '~/components/content/ContentCode.vue' import ContentCode from '~/components/content/ContentCode.vue'
import AccountHoverWrapper from '~/components/account/AccountHoverWrapper.vue' import AccountHoverWrapper from '~/components/account/AccountHoverWrapper.vue'
@ -13,12 +13,9 @@ import AccountHoverWrapper from '~/components/account/AccountHoverWrapper.vue'
*/ */
export function contentToVNode( export function contentToVNode(
content: string, content: string,
{ emojis = {}, markdown = true }: { options?: ContentParseOptions,
emojis?: Record<string, Emoji>
markdown?: boolean
} = {},
): VNode { ): VNode {
const tree = parseMastodonHTML(content, emojis, markdown) const tree = parseMastodonHTML(content, options)
return h(Fragment, (tree.children as Node[]).map(n => treeToVNode(n))) return h(Fragment, (tree.children as Node[]).map(n => treeToVNode(n)))
} }

View file

@ -51,3 +51,16 @@ export const customEmojisData = computed(() => currentCustomEmojis.value.emojis.
emojis: transformEmojiData(currentCustomEmojis.value.emojis), emojis: transformEmojiData(currentCustomEmojis.value.emojis),
}] }]
: undefined) : undefined)
/**
 * Creates a computed emoji lookup that combines the emojis returned by
 * `emojisGetter` (when it returns any) with the current custom emojis.
 *
 * The getter's emojis come first in the list passed to `emojisArrayToObject`,
 * followed by `currentCustomEmojis.value.emojis`.
 *
 * @param emojisGetter - Getter evaluated inside the computed on each recomputation.
 * @returns A computed ref holding the result of `emojisArrayToObject`.
 */
export function useEmojisFallback(emojisGetter: () => Emoji[] | undefined) {
  return computed(() => {
    const provided = emojisGetter()
    const merged: Emoji[] = provided
      ? [...provided, ...currentCustomEmojis.value.emojis]
      : [...currentCustomEmojis.value.emojis]
    return emojisArrayToObject(merged)
  })
}

View file

@ -1,3 +1,4 @@
// @unimport-disabled
import { emojiFilename, emojiPrefix, emojiRegEx } from '@iconify-emoji/twemoji' import { emojiFilename, emojiPrefix, emojiRegEx } from '@iconify-emoji/twemoji'
import type { EmojiRegexMatch } from '@iconify/utils/lib/emoji/replace/find' import type { EmojiRegexMatch } from '@iconify/utils/lib/emoji/replace/find'
import { getEmojiMatchesInText } from '@iconify/utils/lib/emoji/replace/find' import { getEmojiMatchesInText } from '@iconify/utils/lib/emoji/replace/find'

View file

@ -1,12 +1,5 @@
// Vitest Snapshot v1 // Vitest Snapshot v1
exports[`content-rich > JavaScript hrefs get removed 1`] = `
"<p>
<a href=\\"#\\" rel=\\"nofollow noopener noreferrer\\" target=\\"_blank\\">click me</a>
</p>
"
`;
exports[`content-rich > code frame 1`] = ` exports[`content-rich > code frame 1`] = `
"<p>Testing code block</p><p><pre lang=\\"ts\\">import { useMouse, usePreferredDark } from &#39;@vueuse/core&#39; "<p>Testing code block</p><p><pre lang=\\"ts\\">import { useMouse, usePreferredDark } from &#39;@vueuse/core&#39;
// tracks mouse position // tracks mouse position
@ -75,8 +68,3 @@ exports[`content-rich > link + mention 1`] = `
</p> </p>
" "
`; `;
exports[`content-rich > script tags get removed 1`] = `
"<p></p>
"
`;

View file

@ -67,7 +67,7 @@ describe('html-parse', () => {
}) })
async function render(input: string, emojis?: Record<string, Emoji>) { async function render(input: string, emojis?: Record<string, Emoji>) {
const tree = parseMastodonHTML(input, emojis) const tree = parseMastodonHTML(input, { emojis })
const html = await renderTree(tree) const html = await renderTree(tree)
let formatted = '' let formatted = ''
const serializedText = treeToText(tree).trim() const serializedText = treeToText(tree).trim()