From 5c0e1790faa22a77a18aa1c242f9e0a0de7ee770 Mon Sep 17 00:00:00 2001
From: jviide
Date: Thu, 5 Jan 2023 09:21:09 +0200
Subject: [PATCH] fix: sanitize HTML from Mastodon servers (#788)
---
composables/content-parse.ts | 284 +++++++++++++++---
tests/__snapshots__/content-rich.test.ts.snap | 20 +-
tests/__snapshots__/html-parse.test.ts.snap | 6 +-
3 files changed, 260 insertions(+), 50 deletions(-)
diff --git a/composables/content-parse.ts b/composables/content-parse.ts
index 92211e90..9cc4bf15 100644
--- a/composables/content-parse.ts
+++ b/composables/content-parse.ts
@@ -1,7 +1,7 @@
// @unimport-disable
import type { Emoji } from 'masto'
import type { Node } from 'ultrahtml'
-import { TEXT_NODE, parse, render, walkSync } from 'ultrahtml'
+import { ELEMENT_NODE, TEXT_NODE, h, parse, render } from 'ultrahtml'
import { findAndReplaceEmojisInText } from '@iconify/utils'
import { emojiRegEx, getEmojiAttributes } from '../config/emojis'
@@ -19,53 +19,43 @@ export function decodeHtml(text: string) {
* with interop of custom emojis and inline Markdown syntax
*/
export function parseMastodonHTML(html: string, customEmojis: Record = {}, markdown = true, forTiptap = false) {
- // unicode emojis to images, but only if not converting HTML for Tiptap
- let processed = forTiptap ? html : replaceUnicodeEmoji(html)
-
- // custom emojis
- processed = processed.replace(/:([\w-]+?):/g, (_, name) => {
- const emoji = customEmojis[name]
- if (emoji)
- return ``
- return `:${name}:`
- })
-
if (markdown) {
- // handle code blocks
- processed = processed
+ // Handle code blocks
+ html = html
.replace(/>(```|~~~)(\w*)([\s\S]+?)\1/g, (_1, _2, lang, raw) => {
const code = htmlToText(raw)
const classes = lang ? ` class="language-${lang}"` : ''
return `>${code}
`
})
-
- walkSync(parse(processed), (node) => {
- if (node.type !== TEXT_NODE)
- return
- const replacements = [
- [/\*\*\*(.*?)\*\*\*/g, '$1'],
- [/\*\*(.*?)\*\*/g, '$1'],
- [/\*(.*?)\*/g, '$1'],
- [/~~(.*?)~~/g, '$1'],
- [/`([^`]+?)`/g, '$1
'],
- ] as any
-
- for (const [re, replacement] of replacements) {
- for (const match of node.value.matchAll(re)) {
- if (node.loc) {
- const start = match.index! + node.loc[0].start
- const end = start + match[0].length + node.loc[0].start
- processed = processed.slice(0, start) + match[0].replace(re, replacement) + processed.slice(end)
- }
- else {
- processed = processed.replace(match[0], match[0].replace(re, replacement))
- }
- }
- }
- })
}
- return parse(processed)
+ // Always sanitize the raw HTML data *after* it has been modified
+ const basicClasses = filterClasses(/^(h-\S*|p-\S*|u-\S*|dt-\S*|e-\S*|mention|hashtag|ellipsis|invisible)$/u)
+ return transformSync(parse(html), [
+ sanitize({
+ // Allow basic elements as seen in https://github.com/mastodon/mastodon/blob/17f79082b098e05b68d6f0d38fabb3ac121879a9/lib/sanitize_ext/sanitize_config.rb
+ br: {},
+ p: {},
+ a: {
+ href: filterHref(),
+ class: basicClasses,
+ rel: set('nofollow noopener noreferrer'),
+ target: set('_blank'),
+ },
+ span: {
+ class: basicClasses,
+ },
+ // Allow elements potentially created for Markdown code blocks above
+ pre: {},
+ code: {
+ class: filterClasses(/^language-\w+$/),
+ },
+ }),
+ // Unicode emojis to images, but only if not converting HTML for Tiptap
+ !forTiptap ? replaceUnicodeEmoji() : noopTransform(),
+ markdown ? formatMarkdown() : noopTransform(),
+ replaceCustomEmoji(customEmojis),
+ ])
}
/**
@@ -133,12 +123,210 @@ export function treeToText(input: Node): string {
return pre + body + post
}
-/**
- * Replace unicode emojis with locally hosted images
- */
-export function replaceUnicodeEmoji(html: string) {
- return findAndReplaceEmojisInText(emojiRegEx, html, (match) => {
- const attrs = getEmojiAttributes(match)
- return ``
- }) || html
+// A tree transform receives a single ultrahtml Node and returns whatever
+// should take that node's place in the tree:
+// - a Node, or an array of Nodes/strings, replaces it,
+// - a string is converted to a text node,
+// - null removes it.
+// A node's children have already been transformed by the time the node
+// itself is handed to the transform.
+type Transform = (node: Node) => (Node | string)[] | Node | string | null
+
+// Runs the given transforms over a parsed HTML tree, one complete pass per
+// transform, in the order given. The tree is modified in place and the
+// root node itself is never handed to a transform.
+function transformSync(doc: Node, transforms: Transform[]) {
+  // Turns one transform result into a proper child of `parent`.
+  const adopt = (parent: Node, child: Node | string) => {
+    if (typeof child === 'string')
+      return { type: TEXT_NODE, value: child, parent }
+    child.parent = parent
+    return child
+  }
+
+  // Depth-first pass: rebuild the child list first, then transform the node.
+  function walk(node: Node, transform: Transform, isRoot: boolean) {
+    if (Array.isArray(node.children)) {
+      const replaced: (Node | string)[] = []
+      for (const child of node.children) {
+        const result = walk(child, transform, false)
+        if (Array.isArray(result))
+          replaced.push(...result)
+        // Falsy results (null, empty strings) are dropped from the tree
+        else if (result)
+          replaced.push(result)
+      }
+      node.children = replaced.map(child => adopt(node, child))
+    }
+    return isRoot ? node : transform(node)
+  }
+
+  return transforms.reduce((root, transform) => walk(root, transform, true) as Node, doc)
+}
+
+// Identity transform: hands every node back untouched. Lets a conditional
+// step in a transform chain be switched off without changing the chain's shape.
+function noopTransform(): Transform {
+  const identity: Transform = node => node
+  return identity
+}
+
+// Maps an attribute name to a function that receives the attribute's current
+// value (undefined when the attribute is absent) and returns the value to
+// keep, or undefined to leave the attribute out.
+type AttrSanitizers = Record<string, (value: string | undefined) => string | undefined>
+
+// A tree transform for sanitizing elements & their attributes.
+// - Non-element nodes pass through unchanged.
+// - Elements whose tag name is not an own key of `allowedElements` are
+//   removed from the tree.
+// - Allowed elements keep only the attributes listed for their tag, each
+//   one rewritten by its sanitizer; every other attribute is discarded.
+function sanitize(allowedElements: Record<string, AttrSanitizers>): Transform {
+  return (node) => {
+    if (node.type !== ELEMENT_NODE)
+      return node
+
+    // Own-property check so that tag names such as "constructor" cannot
+    // match something inherited from Object.prototype
+    if (!Object.prototype.hasOwnProperty.call(allowedElements, node.name))
+      return null
+
+    const attrSanitizers = allowedElements[node.name]
+    const attrs = {} as Record<string, string>
+    for (const [name, func] of Object.entries(attrSanitizers)) {
+      // Sanitizers also run for missing attributes, which lets them force a
+      // fixed value onto the element
+      const value = func(node.attributes[name])
+      if (value !== undefined)
+        attrs[name] = value
+    }
+    node.attributes = attrs
+    return node
+  }
+}
+
+// Builds a sanitizer for `class` attributes: the value is split on
+// whitespace and only the class names matching `allowed` are kept, joined
+// by single spaces. A missing or empty attribute yields undefined.
+function filterClasses(allowed: RegExp) {
+  return (c: string | undefined) => {
+    if (c) {
+      const kept: string[] = []
+      for (const cls of c.split(/\s/g)) {
+        if (allowed.test(cls))
+          kept.push(cls)
+      }
+      return kept.join(' ')
+    }
+    return undefined
+  }
+}
+
+// Builds a sanitizer that ignores the attribute's current value and always
+// produces `value`.
+function set(value: string) {
+  const constant = () => value
+  return constant
+}
+
+// Builds a sanitizer for `href` attributes.
+// - A missing attribute stays missing.
+// - Values starting with "/" or "." are treated as relative links and kept
+//   verbatim.
+// - Values that cannot be parsed as an absolute URL are dropped.
+// - Absolute URLs with an allowed protocol are returned in serialized form;
+//   any other protocol is replaced by "#".
+function filterHref() {
+  const allowedProtocols = new Set([
+    'http:',
+    'https:',
+    'dat:',
+    'dweb:',
+    'ipfs:',
+    'ipns:',
+    'ssb:',
+    'gopher:',
+    'xmpp:',
+    'magnet:',
+    'gemini:',
+  ])
+
+  // Parses an absolute URL. A TypeError from the URL constructor means the
+  // value is not a valid URL; anything else is unexpected and is rethrown.
+  const parseAbsolute = (href: string) => {
+    try {
+      return new URL(href)
+    }
+    catch (err) {
+      if (!(err instanceof TypeError))
+        throw err
+      return undefined
+    }
+  }
+
+  return (href: string | undefined) => {
+    if (href === undefined)
+      return undefined
+
+    const isRelative = href.startsWith('/') || href.startsWith('.')
+    if (isRelative)
+      return href
+
+    const url = parseAbsolute(href)
+    if (!url)
+      return undefined
+
+    return allowedProtocols.has(url.protocol) ? url.toString() : '#'
+  }
+}
+
+// A tree transform that swaps unicode emojis inside text nodes for <img>
+// elements built from getEmojiAttributes(). Text nodes without any emoji,
+// and all non-text nodes, are returned unchanged.
+function replaceUnicodeEmoji(): Transform {
+  return (node) => {
+    if (node.type !== TEXT_NODE)
+      return node
+
+    const pieces: (string | Node)[] = []
+    // Offset into the text up to which `pieces` already covers the input
+    let consumed = 0
+
+    // The finder is used purely as a scanner: the callback collects the
+    // pieces itself and returns undefined instead of a replacement string.
+    findAndReplaceEmojisInText(emojiRegEx, node.value, (match, result) => {
+      const { src, alt, class: className } = getEmojiAttributes(match)
+      pieces.push(result.slice(consumed))
+      pieces.push(h('img', { src, alt, class: className }))
+      consumed = result.length + match.match.length
+      return undefined
+    })
+
+    if (pieces.length > 0) {
+      pieces.push(node.value.slice(consumed))
+      // Drop the empty strings that appear around adjacent emojis
+      return pieces.filter(Boolean)
+    }
+    return node
+  }
+}
+
+// A tree transform that replaces `:shortcode:` sequences inside text nodes
+// with <img class="custom-emoji"> elements for the shortcodes present in
+// `customEmojis`. Unknown shortcodes are left in the text as written.
+// Non-text nodes, and text nodes without any shortcode, are returned unchanged.
+function replaceCustomEmoji(customEmojis: Record<string, Emoji>): Transform {
+  return (node) => {
+    if (node.type !== TEXT_NODE)
+      return node
+
+    // Splitting on a regex with one capture group yields plain text at the
+    // even indices and the captured shortcode names at the odd ones
+    const split = node.value.split(/:([\w-]+?):/g)
+    if (split.length === 1)
+      return node
+
+    return split.map((name, i) => {
+      if (i % 2 === 0)
+        return name
+
+      // Only accept own keys: the shortcode comes from untrusted text, and
+      // names such as "constructor" or "toString" would otherwise resolve
+      // to members inherited from Object.prototype
+      const emoji = Object.prototype.hasOwnProperty.call(customEmojis, name)
+        ? customEmojis[name]
+        : undefined
+      if (!emoji)
+        return `:${name}:`
+
+      return h('img', { 'src': emoji.url, 'alt': `:${name}:`, 'class': 'custom-emoji', 'data-emoji-id': name })
+    }).filter(Boolean)
+  }
+}
+
+// A tree transform that turns inline Markdown inside text nodes into
+// elements: ***x*** -> <b><em>, **x** -> <b>, *x* -> <em>, ~~x~~ -> <del>
+// and `x` -> <code>. The content of a match is processed recursively, so
+// markers may nest. Non-text nodes are returned unchanged.
+function formatMarkdown(): Transform {
+  type Wrapper = (c: (string | Node)[]) => Node
+
+  // Order matters: when two patterns match at the same offset, the one
+  // listed first wins.
+  const rules: [RegExp, Wrapper][] = [
+    [/\*\*\*(.*?)\*\*\*/g, c => h('b', null, [h('em', null, c)])],
+    [/\*\*(.*?)\*\*/g, c => h('b', null, c)],
+    [/\*(.*?)\*/g, c => h('em', null, c)],
+    [/~~(.*?)~~/g, c => h('del', null, c)],
+    [/`([^`]+?)`/g, c => h('code', null, c)],
+  ]
+
+  // Finds the match that starts earliest at or after `from`, if any.
+  function findEarliest(text: string, from: number) {
+    let best: { match: RegExpMatchArray; replacer: Wrapper } | undefined
+    for (const [pattern, replacer] of rules) {
+      // The patterns are global, so the search offset is set explicitly
+      pattern.lastIndex = from
+      const match = pattern.exec(text)
+      if (match && (!best || match.index < best.match.index!))
+        best = { match, replacer }
+    }
+    return best
+  }
+
+  // Splits `value` into plain strings and formatted elements.
+  function process(value: string): (string | Node)[] {
+    const pieces: (string | Node)[] = []
+    let cursor = 0
+
+    for (let hit = findEarliest(value, cursor); hit; hit = findEarliest(value, cursor)) {
+      pieces.push(value.slice(cursor, hit.match.index))
+      pieces.push(hit.replacer(process(hit.match[1])))
+      cursor = hit.match.index! + hit.match[0].length
+    }
+
+    pieces.push(value.slice(cursor))
+    // Drop empty strings left between adjacent matches
+    return pieces.filter(Boolean)
+  }
+
+  return (node) => {
+    if (node.type !== TEXT_NODE)
+      return node
+    return process(node.value)
+  }
}
diff --git a/tests/__snapshots__/content-rich.test.ts.snap b/tests/__snapshots__/content-rich.test.ts.snap
index 835b6a09..dcc39728 100644
--- a/tests/__snapshots__/content-rich.test.ts.snap
+++ b/tests/__snapshots__/content-rich.test.ts.snap
@@ -1,5 +1,12 @@
// Vitest Snapshot v1
+exports[`content-rich > JavaScript hrefs get removed 1`] = `
+"
+ click me
+
+"
+`;
+
exports[`content-rich > code frame 1`] = `
"Testing code block
import { useMouse, usePreferredDark } from '@vueuse/core'
// tracks mouse position
@@ -10,7 +17,13 @@ const isDark = usePreferredDark()
"
exports[`content-rich > code frame 2 1`] = `
"
-
+
Testing
const a = hello
@@ -62,3 +75,8 @@ exports[`content-rich > link + mention 1`] = `
"
`;
+
+exports[`content-rich > script tags get removed 1`] = `
+"
+"
+`;
diff --git a/tests/__snapshots__/html-parse.test.ts.snap b/tests/__snapshots__/html-parse.test.ts.snap
index 863cd499..6cd2fdcd 100644
--- a/tests/__snapshots__/html-parse.test.ts.snap
+++ b/tests/__snapshots__/html-parse.test.ts.snap
@@ -23,7 +23,11 @@ const isDark = usePreferredDark()
exports[`html-parse > code frame 2 > html 1`] = `
"
@antfu