impl #1

2026-02-13 12:29:07 +00:00 · 2022-06-03 18:49:12 -07:00
parent af6859acc2
commit 9f640ffb51
4 changed files with 403 additions and 0 deletions
--- a/bench/snippets/escapeHTML.js
+++ b/bench/snippets/escapeHTML.js
@@ -0,0 +1,86 @@
+import { group } from "mitata";
+import { bench, run } from "mitata";
+
+var bunEscapeHTML = Bun.escapeHTML; // implementation being compared against the JS baseline below
+
+const matchHtmlRegExp = /["'&<>]/; // matches the first character that needs escaping, if any
+
+/**
+ * JavaScript baseline for the benchmark (reported as "react's escapeHTML").
+ *
+ * Replaces each of the five characters `"`, `&`, `'`, `<` and `>` with its
+ * HTML entity (`&quot;`, `&amp;`, `&#x27;`, `&lt;`, `&gt;`). Every other
+ * character is copied through unchanged.
+ *
+ * The argument is coerced to a string first. When the coerced string contains
+ * none of the five characters it is returned as-is, without building a new
+ * string.
+ *
+ * @param  {string} string HTML string to escape for later insertion
+ * @return {string} the escaped string
+ * @public
+ */
+
+function reactEscapeHtml(string) {
+  const str = "" + string; // coerce non-string input to a string
+  const match = matchHtmlRegExp.exec(str);
+
+  if (!match) { // fast path: nothing to escape
+    return str;
+  }
+
+  let escape;
+  let html = "";
+  let index;
+  let lastIndex = 0; // start of the pending run of characters that need no escaping
+
+  for (index = match.index; index < str.length; index++) { // begin at the first character that needs escaping
+    switch (str.charCodeAt(index)) {
+      case 34: // "
+        escape = "&quot;";
+        break;
+      case 38: // &
+        escape = "&amp;";
+        break;
+      case 39: // '
+        escape = "&#x27;"; // modified from escape-html; used to be '&#39'
+        break;
+      case 60: // <
+        escape = "&lt;";
+        break;
+      case 62: // >
+        escape = "&gt;";
+        break;
+      default:
+        continue; // ordinary character: keep extending the pending run
+    }
+
+    if (lastIndex !== index) { // flush the pending unescaped run before the entity
+      html += str.substring(lastIndex, index);
+    }
+
+    lastIndex = index + 1;
+    html += escape;
+  }
+
+  return lastIndex !== index ? html + str.substring(lastIndex, index) : html; // append any trailing unescaped run
+}
+
+const long = ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(9000);
+const short = "lalala" + "<script>alert(1)</script>" + "lalala";
+const middle =
+  "lalala".repeat(2000) + "<script>alert(1)</script>" + "lalala".repeat(2000);
+const nothing = "lalala".repeat(9999);
+group(`long (${long.length})`, () => {
+  bench("react's escapeHTML", () => reactEscapeHtml(long));
+  bench("bun's escapeHTML", () => bunEscapeHTML(long));
+});
+
+group(`short (${short.length})`, () => {
+  bench("react's escapeHTML", () => reactEscapeHtml(short));
+  bench("bun's escapeHTML", () => bunEscapeHTML(short));
+});
+
+group(`middle (${middle.length})`, () => {
+  bench("react's escapeHTML", () => reactEscapeHtml(middle));
+  bench("bun's escapeHTML", () => bunEscapeHTML(middle));
+});
+
+group(`nothing (${nothing.length})`, () => {
+  bench("react's escapeHTML", () => reactEscapeHtml(nothing));
+  bench("bun's escapeHTML", () => bunEscapeHTML(nothing));
+});
+
+await run();
--- a/integration/bunjs-only-snippets/escapeHTML.test.js
+++ b/integration/bunjs-only-snippets/escapeHTML.test.js
@@ -0,0 +1,54 @@
+import { describe, it, expect } from "bun:test";
+import { gcTick } from "./gc";
+
+describe("Bun.escapeHTML", () => {
+  it("works", () => {
+    expect(Bun.escapeHTML("<script>alert(1)</script>")).toBe(
+      "&lt;script&gt;alert(1)&lt;/script&gt;"
+    );
+    expect(Bun.escapeHTML("<")).toBe("&lt;");
+    expect(Bun.escapeHTML(">")).toBe("&gt;");
+    expect(Bun.escapeHTML("&")).toBe("&amp;");
+    expect(Bun.escapeHTML("'")).toBe("&#x27;");
+    expect(Bun.escapeHTML('"')).toBe("&quot;");
+    expect(Bun.escapeHTML("\n")).toBe("\n");
+    expect(Bun.escapeHTML("\r")).toBe("\r");
+    expect(Bun.escapeHTML("\t")).toBe("\t");
+    expect(Bun.escapeHTML("\f")).toBe("\f");
+    expect(Bun.escapeHTML("\v")).toBe("\v");
+    expect(Bun.escapeHTML("\b")).toBe("\b");
+    expect(Bun.escapeHTML("\u00A0")).toBe("\u00A0");
+
+    // The matrix of cases we need to test for:
+    // 1. Works with short strings
+    // 2. Works with long strings
+    // 3. Works with latin1 strings
+    // 4. Works with utf16 strings
+    // 5. Works when the text to escape is somewhere in the middle
+    // 6. Works when the text to escape is in the beginning
+    // 7. Works when the text to escape is in the end
+    // 8. Returns the same string when there's no need to escape
+    expect(
+      Bun.escapeHTML("lalala" + "<script>alert(1)</script>" + "lalala")
+    ).toBe("lalala&lt;script&gt;alert(1)&lt;/script&gt;lalala");
+
+    expect(Bun.escapeHTML("<script>alert(1)</script>" + "lalala")).toBe(
+      "&lt;script&gt;alert(1)&lt;/script&gt;lalala"
+    );
+    expect(Bun.escapeHTML("lalala" + "<script>alert(1)</script>")).toBe(
+      "lalala" + "&lt;script&gt;alert(1)&lt;/script&gt;"
+    );
+
+    expect(
+      Bun.escapeHTML(
+        ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(900)
+      )
+    ).toBe("lalala&lt;script&gt;alert(1)&lt;/script&gt;lalala".repeat(900));
+    expect(
+      Bun.escapeHTML(("<script>alert(1)</script>" + "lalala").repeat(900))
+    ).toBe("&lt;script&gt;alert(1)&lt;/script&gt;lalala".repeat(900));
+    expect(
+      Bun.escapeHTML(("lalala" + "<script>alert(1)</script>").repeat(900))
+    ).toBe(("lalala" + "&lt;script&gt;alert(1)&lt;/script&gt;").repeat(900));
+  });
+});
--- a/src/javascript/jsc/api/bun.zig
+++ b/src/javascript/jsc/api/bun.zig
@@ -1150,6 +1150,9 @@ pub const Class = NewClass(
        .inflateSync = .{
            .rfn = JSC.wrapWithHasContainer(JSZlib, "inflateSync", false, false, true),
        },
+        .escapeHTML = .{
+            .rfn = Bun.escapeHTML,
+        },
    },
    .{
        .main = .{
@@ -1612,6 +1615,42 @@ pub fn serve(
    unreachable;
 }

+/// Host function behind `Bun.escapeHTML(input)`.
+///
+/// - With no arguments, returns an empty string.
+/// - When the input string needs no escaping, returns the original JS value.
+/// - Otherwise returns a new JS string with `"`, `&`, `'`, `<` and `>` replaced by HTML entities.
+///
+/// On allocation failure, an "Out of memory" error is written to `exception` and `null` is returned.
+pub fn escapeHTML(
+    _: void,
+    ctx: js.JSContextRef,
+    _: js.JSObjectRef,
+    _: js.JSObjectRef,
+    arguments: []const js.JSValueRef,
+    exception: js.ExceptionRef,
+) js.JSValueRef {
+    if (arguments.len < 1) {
+        return ZigString.init("").toValue(ctx).asObjectRef();
+    }
+
+    const input_value = arguments[0].?.value();
+    const zig_str = input_value.getZigString(ctx);
+    if (zig_str.is16Bit()) {
+        // NOTE(review): 16-bit (UTF-16) strings are currently returned WITHOUT any escaping.
+        // Callers must not treat the result as safe HTML for such inputs until a UTF-16
+        // escaping path exists.
+        return input_value.asObjectRef();
+    } else {
+        var input_slice = zig_str.slice();
+        var escaped_html = strings.escapeHTMLForLatin1Input(ctx.bunVM().allocator, input_slice) catch {
+            JSC.JSError(undefined, "Out of memory", .{}, ctx, exception);
+            return null;
+        };
+
+        // Same pointer and length means nothing needed escaping: hand back the original value.
+        if (escaped_html.ptr == input_slice.ptr and escaped_html.len == input_slice.len) {
+            return input_value.asObjectRef();
+        }
+
+        if (input_slice.len <= 1) {
+            // Results for empty and single-character inputs are statically allocated
+            // (`""` or one of the entity literals), so they must not take the
+            // toExternalValue path below, which is reserved for heap-allocated results.
+            return ZigString.init(escaped_html).toValue(ctx).asObjectRef();
+        }
+
+        // Longer escaped results were allocated by escapeHTMLForLatin1Input.
+        // presumably toExternalValue hands ownership of that allocation to the JS string — TODO confirm
+        return ZigString.init(escaped_html).toExternalValue(ctx).asObjectRef();
+    }
+}
+
 pub fn allocUnsafe(
    _: void,
    ctx: js.JSContextRef,
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1316,6 +1316,230 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
    return count;
 }

+/// Escapes the five HTML-significant characters (`"`, `&`, `'`, `<`, `>`) in a Latin-1 string.
+///
+/// The returned slice is one of:
+/// - a statically allocated string, when `latin1` is empty or is a single character that needs escaping
+/// - `latin1` itself (same pointer and length), when nothing needs escaping
+/// - a new slice allocated with `allocator`, when a longer input contains at least one character to escape
+///
+/// Callers can detect the "nothing to escape" case by comparing the result's pointer and length
+/// with the input. Returns an error if allocating or growing the output buffer fails.
+pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) ![]const u8 {
+    switch (latin1.len) {
+        0 => return "",
+        1 => return switch (latin1[0]) {
+            '"' => "&quot;",
+            '&' => "&amp;",
+            '\'' => "&#x27;",
+            '<' => "&lt;",
+            '>' => "&gt;",
+            else => latin1,
+        },
+        else => {
+            var remaining = latin1;
+
+            // One splatted vector per character that needs escaping, for the SIMD comparisons below.
+            const vec_chars = "\"&'<>";
+            const vecs: [vec_chars.len]AsciiVector = comptime brk: {
+                var _vecs: [vec_chars.len]AsciiVector = undefined;
+                for (vec_chars) |c, i| {
+                    _vecs[i] = @splat(ascii_vector_size, c);
+                }
+                break :brk _vecs;
+            };
+
+            // `buf` stays undefined until the first character needing an escape is found;
+            // `any_needs_escape` doubles as the "buf has been allocated" flag.
+            var buf: std.ArrayList(u8) = undefined;
+            var any_needs_escape = false;
+            // If a later append fails, release the partially built output instead of leaking it.
+            errdefer {
+                if (any_needs_escape) buf.deinit();
+            }
+
+            if (comptime Environment.isAarch64 or Environment.isX64) {
+
+                // pass #1: scan for any characters that need escaping
+                // assume most strings won't need any escaping, so don't actually allocate the buffer
+                scan_and_allocate_lazily: while (remaining.len >= ascii_vector_size) {
+                    if (comptime Environment.allow_assert) {
+                        std.debug.assert(!any_needs_escape);
+                    }
+
+                    const vec: AsciiVector = remaining[0..ascii_vector_size].*;
+                    if (@reduce(
+                        .Or,
+                        @bitCast(AsciiVectorU1, (vec == vecs[0])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[1])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[2])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[3])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[4])),
+                    ) == 1) {
+                        // First hit: allocate the output and copy everything scanned so far.
+                        buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                        const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                        @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                        buf.items.len = copy_len;
+                        any_needs_escape = true;
+                        comptime var i: usize = 0;
+                        inline while (i < ascii_vector_size) : (i += 1) {
+                            switch (vec[i]) {
+                                '"' => {
+                                    try buf.appendSlice("&quot;");
+                                },
+                                '&' => {
+                                    try buf.appendSlice("&amp;");
+                                },
+                                '\'' => {
+                                    try buf.appendSlice("&#x27;"); // modified from escape-html; used to be '&#39'
+                                },
+                                '<' => {
+                                    try buf.appendSlice("&lt;");
+                                },
+                                '>' => {
+                                    try buf.appendSlice("&gt;");
+                                },
+                                else => |c| {
+                                    // Escapes earlier in this vector may already have used up the
+                                    // initial capacity, so this append must be allowed to grow.
+                                    try buf.append(c);
+                                },
+                            }
+                        }
+                        remaining = remaining[ascii_vector_size..];
+                        break :scan_and_allocate_lazily;
+                    }
+
+                    remaining = remaining[ascii_vector_size..];
+                }
+
+                if (any_needs_escape) {
+                    // pass #2: we found something that needed an escape
+                    // so we'll go ahead and copy the buffer into a new buffer
+                    while (remaining.len >= ascii_vector_size) {
+                        const vec: AsciiVector = remaining[0..ascii_vector_size].*;
+                        if (@reduce(
+                            .Or,
+                            @bitCast(AsciiVectorU1, (vec == vecs[0])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[1])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[2])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[3])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[4])),
+                        ) == 1) {
+                            try buf.ensureUnusedCapacity(ascii_vector_size);
+                            comptime var i: usize = 0;
+                            inline while (i < ascii_vector_size) : (i += 1) {
+                                switch (vec[i]) {
+                                    '"' => {
+                                        try buf.appendSlice("&quot;");
+                                    },
+                                    '&' => {
+                                        try buf.appendSlice("&amp;");
+                                    },
+                                    '\'' => {
+                                        try buf.appendSlice("&#x27;"); // modified from escape-html; used to be '&#39'
+                                    },
+                                    '<' => {
+                                        try buf.appendSlice("&lt;");
+                                    },
+                                    '>' => {
+                                        try buf.appendSlice("&gt;");
+                                    },
+                                    else => |c| {
+                                        try buf.append(c);
+                                    },
+                                }
+                            }
+
+                            remaining = remaining[ascii_vector_size..];
+                            continue;
+                        }
+
+                        // Nothing to escape in this vector: copy it through in one store.
+                        try buf.ensureUnusedCapacity(ascii_vector_size);
+                        buf.items.ptr[buf.items.len .. buf.items.len + ascii_vector_size][0..ascii_vector_size].* = remaining[0..ascii_vector_size].*;
+                        buf.items.len += ascii_vector_size;
+                        remaining = remaining[ascii_vector_size..];
+                    }
+                }
+            }
+
+            if (!any_needs_escape) {
+                // Scalar scan over whatever the vector pass did not cover (the whole input on
+                // other architectures). Still lazy: allocate only at the first character to escape.
+                scan_and_allocate_lazily: while (remaining.len > 0) {
+                    const escaped: []const u8 = switch (remaining[0]) {
+                        '"' => "&quot;",
+                        '&' => "&amp;",
+                        '\'' => "&#x27;", // modified from escape-html; used to be '&#39'
+                        '<' => "&lt;",
+                        '>' => "&gt;",
+                        else => {
+                            remaining = remaining[1..];
+                            continue :scan_and_allocate_lazily;
+                        },
+                    };
+
+                    const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                    buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                    @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                    buf.items.len = copy_len;
+                    any_needs_escape = true;
+                    try buf.appendSlice(escaped);
+                    remaining = remaining[1..];
+                    break :scan_and_allocate_lazily;
+                }
+            }
+
+            // Tail: once the buffer exists, escape the remaining bytes one at a time.
+            if (remaining.len > 0) {
+                std.debug.assert(any_needs_escape);
+                for (remaining) |c| {
+                    switch (c) {
+                        '"' => {
+                            try buf.appendSlice("&quot;");
+                        },
+                        '&' => {
+                            try buf.appendSlice("&amp;");
+                        },
+                        '\'' => {
+                            try buf.appendSlice("&#x27;"); // modified from escape-html; used to be '&#39'
+                        },
+                        '<' => {
+                            try buf.appendSlice("&lt;");
+                        },
+                        '>' => {
+                            try buf.appendSlice("&gt;");
+                        },
+                        else => {
+                            try buf.append(c);
+                        },
+                    }
+                }
+            }
+
+            if (any_needs_escape) {
+                return buf.toOwnedSlice();
+            } else {
+                return latin1;
+            }
+        },
+    }
+}
+
 test "copyLatin1IntoUTF8" {
    var input: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!";
    var output = std.mem.zeroes([500]u8);