This commit is contained in:
Jarred Sumner
2022-06-03 18:49:12 -07:00
parent af6859acc2
commit 9f640ffb51
4 changed files with 403 additions and 0 deletions

View File

@@ -0,0 +1,86 @@
import { group } from "mitata";
import { bench, run } from "mitata";
// Local alias for Bun's built-in escaper; this is the function the "bun's escapeHTML" benchmarks below invoke.
var bunEscapeHTML = Bun.escapeHTML;
// Matches the first character that has to be replaced by an HTML entity.
const matchHtmlRegExp = /["'&<>]/;

/**
 * Returns a copy of the input in which `"`, `&`, `'`, `<` and `>` are replaced
 * by their HTML entities. The argument is coerced to a string first; when no
 * character needs replacing, the coerced string is returned unchanged.
 *
 * @param {string} string Text to make safe for insertion into HTML
 * @return {string}
 * @public
 */
function reactEscapeHtml(string) {
  const text = "" + string;
  const firstHit = matchHtmlRegExp.exec(text);
  if (firstHit === null) {
    return text;
  }

  let out = "";
  // Everything before `copiedUpTo` has already been emitted into `out`.
  let copiedUpTo = 0;
  // Start at the first character known to need escaping.
  let pos = firstHit.index;

  while (pos < text.length) {
    let entity;
    switch (text.charCodeAt(pos)) {
      case 34: // "
        entity = "&quot;";
        break;
      case 38: // &
        entity = "&amp;";
        break;
      case 39: // '
        entity = "&#x27;"; // modified from escape-html; used to be '&#39'
        break;
      case 60: // <
        entity = "&lt;";
        break;
      case 62: // >
        entity = "&gt;";
        break;
      default:
        entity = undefined;
    }

    if (entity !== undefined) {
      // Flush the untouched run preceding this character, then the entity.
      if (copiedUpTo !== pos) {
        out += text.substring(copiedUpTo, pos);
      }
      out += entity;
      copiedUpTo = pos + 1;
    }

    pos++;
  }

  // Append whatever plain text follows the last escaped character.
  if (copiedUpTo !== pos) {
    return out + text.substring(copiedUpTo, pos);
  }
  return out;
}
// Benchmark inputs, keyed by the label shown in the report:
//   long    - many escapable snippets repeated back to back
//   short   - a single snippet with a little plain text around it
//   middle  - one snippet buried between two long plain-text runs
//   nothing - plain text only, nothing to escape
const fixtures = {
  long: ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(9000),
  short: "lalala" + "<script>alert(1)</script>" + "lalala",
  middle:
    "lalala".repeat(2000) + "<script>alert(1)</script>" + "lalala".repeat(2000),
  nothing: "lalala".repeat(9999),
};

// Register one group per fixture, comparing both implementations on the same input.
for (const [label, input] of Object.entries(fixtures)) {
  group(`${label} (${input.length})`, () => {
    bench("react's escapeHTML", () => reactEscapeHtml(input));
    bench("bun's escapeHTML", () => bunEscapeHTML(input));
  });
}

await run();

View File

@@ -0,0 +1,54 @@
import { describe, it, expect } from "bun:test";
import { gcTick } from "./gc";
describe("Bun.escapeHTML", () => {
  it("works", () => {
    // Each entry is [input, expected output].
    const cases = [
      // A full snippet, then every escapable character on its own.
      ["<script>alert(1)</script>", "&lt;script&gt;alert(1)&lt;/script&gt;"],
      ["<", "&lt;"],
      [">", "&gt;"],
      ["&", "&amp;"],
      ["'", "&#x27;"],
      ['"', "&quot;"],

      // Whitespace, control and non-ASCII Latin-1 characters pass through untouched.
      ["\n", "\n"],
      ["\r", "\r"],
      ["\t", "\t"],
      ["\f", "\f"],
      ["\v", "\v"],
      ["\b", "\b"],
      ["\u00A0", "\u00A0"],

      // The matrix of cases we need to test for:
      // 1. Works with short strings
      // 2. Works with long strings
      // 3. Works with latin1 strings
      // 4. Works with utf16 strings
      // 5. Works when the text to escape is somewhere in the middle
      // 6. Works when the text to escape is in the beginning
      // 7. Works when the text to escape is in the end
      // 8. Returns the same string when there's no need to escape

      // Short: escapable text in the middle, at the beginning, at the end.
      [
        "lalala" + "<script>alert(1)</script>" + "lalala",
        "lalala&lt;script&gt;alert(1)&lt;/script&gt;lalala",
      ],
      [
        "<script>alert(1)</script>" + "lalala",
        "&lt;script&gt;alert(1)&lt;/script&gt;lalala",
      ],
      [
        "lalala" + "<script>alert(1)</script>",
        "lalala" + "&lt;script&gt;alert(1)&lt;/script&gt;",
      ],

      // Long: the same three layouts repeated 900 times.
      [
        ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(900),
        "lalala&lt;script&gt;alert(1)&lt;/script&gt;lalala".repeat(900),
      ],
      [
        ("<script>alert(1)</script>" + "lalala").repeat(900),
        "&lt;script&gt;alert(1)&lt;/script&gt;lalala".repeat(900),
      ],
      [
        ("lalala" + "<script>alert(1)</script>").repeat(900),
        ("lalala" + "&lt;script&gt;alert(1)&lt;/script&gt;").repeat(900),
      ],
    ];

    for (const [input, expected] of cases) {
      expect(Bun.escapeHTML(input)).toBe(expected);
    }
  });
});

View File

@@ -1150,6 +1150,9 @@ pub const Class = NewClass(
.inflateSync = .{
.rfn = JSC.wrapWithHasContainer(JSZlib, "inflateSync", false, false, true),
},
.escapeHTML = .{
.rfn = Bun.escapeHTML,
},
},
.{
.main = .{
@@ -1612,6 +1615,42 @@ pub fn serve(
unreachable;
}
/// JS binding for `Bun.escapeHTML(input)`.
///
/// Replaces `"`, `&`, `'`, `<` and `>` in the first argument with HTML entities
/// by delegating to `strings.escapeHTMLForLatin1Input`.
///
/// Returns:
/// - an empty string when called without arguments;
/// - the original argument value, untouched, when nothing had to be escaped;
/// - a new string holding the escaped text otherwise;
/// - `null` after reporting "Out of memory" through `exception` when the
///   escaped buffer could not be allocated.
pub fn escapeHTML(
    _: void,
    ctx: js.JSContextRef,
    _: js.JSObjectRef,
    _: js.JSObjectRef,
    arguments: []const js.JSValueRef,
    exception: js.ExceptionRef,
) js.JSValueRef {
    if (arguments.len < 1) {
        return ZigString.init("").toValue(ctx).asObjectRef();
    }

    const input_value = arguments[0].?.value();
    const zig_str = input_value.getZigString(ctx);

    if (zig_str.is16Bit()) {
        // FIXME(review): 16-bit strings are handed back WITHOUT escaping, so any
        // input that is not stored as Latin-1 bypasses this function entirely.
        // Needs a UTF-16 escaping path before this can be relied on for
        // untrusted text.
        return input_value.asObjectRef();
    }

    const input_slice = zig_str.slice();
    const escaped_html = strings.escapeHTMLForLatin1Input(ctx.bunVM().allocator, input_slice) catch {
        JSC.JSError(undefined, "Out of memory", .{}, ctx, exception);
        return null;
    };

    // The escaper returns its input slice when there was nothing to escape;
    // reuse the caller's value instead of creating a new string.
    if (escaped_html.ptr == input_slice.ptr and escaped_html.len == input_slice.len) {
        return input_value.asObjectRef();
    }

    if (input_slice.len <= 1) {
        // For inputs of length 0 and 1 the escaper returns string literals
        // (statically allocated), never allocator-owned memory, so they must
        // not go through `toExternalValue` below.
        // NOTE(review): presumably `toExternalValue` transfers ownership of the
        // buffer to the JS string - confirm against its implementation.
        return ZigString.init(escaped_html).toValue(ctx).asObjectRef();
    }

    return ZigString.init(escaped_html).toExternalValue(ctx).asObjectRef();
}
pub fn allocUnsafe(
_: void,
ctx: js.JSContextRef,

View File

@@ -1316,6 +1316,230 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
return count;
}
/// Appends `c` to `buf`, substituting the matching HTML entity when `c` is one
/// of `"`, `&`, `'`, `<` or `>`. Grows `buf` when needed and returns the
/// allocator's error if that growth fails.
fn appendEscapedLatin1Byte(buf: *std.ArrayList(u8), c: u8) !void {
    switch (c) {
        '"' => try buf.appendSlice("&quot;"),
        '&' => try buf.appendSlice("&amp;"),
        '\'' => try buf.appendSlice("&#x27;"), // modified from escape-html; used to be '&#39'
        '<' => try buf.appendSlice("&lt;"),
        '>' => try buf.appendSlice("&gt;"),
        else => try buf.append(c),
    }
}

/// Escapes `"`, `&`, `'`, `<` and `>` in `latin1` as HTML entities.
///
/// Ownership of the result depends on the input:
/// - empty input, or a single byte that needs escaping: a string literal
///   (static memory, must not be freed);
/// - nothing to escape: `latin1` itself (same pointer and length), so callers
///   can detect the "unchanged" case by comparing pointers;
/// - otherwise: a new slice allocated with `allocator`, owned by the caller.
///
/// Returns an error only when allocating or growing the output buffer fails;
/// in that case nothing is leaked.
pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) ![]const u8 {
    switch (latin1.len) {
        0 => return "",
        1 => return switch (latin1[0]) {
            '"' => "&quot;",
            '&' => "&amp;",
            '\'' => "&#x27;",
            '<' => "&lt;",
            '>' => "&gt;",
            else => latin1,
        },
        else => {
            var remaining = latin1;

            // One splatted vector per escapable character, built at comptime.
            const vec_chars = "\"&'<>";
            const vecs: [vec_chars.len]AsciiVector = comptime brk: {
                var _vecs: [vec_chars.len]AsciiVector = undefined;
                for (vec_chars) |c, i| {
                    _vecs[i] = @splat(ascii_vector_size, c);
                }
                break :brk _vecs;
            };

            // Starts empty (no allocation); replaced by a real buffer the first
            // time an escapable character is found. The errdefer releases
            // whatever is held if a later allocation fails.
            var buf = std.ArrayList(u8).init(allocator);
            errdefer buf.deinit();
            var any_needs_escape = false;

            if (comptime Environment.isAarch64 or Environment.isX64) {
                // pass #1: scan for any characters that need escaping
                // assume most strings won't need any escaping, so don't actually allocate the buffer
                scan_and_allocate_lazily: while (remaining.len >= ascii_vector_size) {
                    if (comptime Environment.allow_assert) {
                        std.debug.assert(!any_needs_escape);
                    }

                    const vec: AsciiVector = remaining[0..ascii_vector_size].*;
                    if (@reduce(
                        .Or,
                        @bitCast(AsciiVectorU1, (vec == vecs[0])) |
                            @bitCast(AsciiVectorU1, (vec == vecs[1])) |
                            @bitCast(AsciiVectorU1, (vec == vecs[2])) |
                            @bitCast(AsciiVectorU1, (vec == vecs[3])) |
                            @bitCast(AsciiVectorU1, (vec == vecs[4])),
                    ) == 1) {
                        // First hit: allocate and copy the clean prefix verbatim.
                        buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
                        const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
                        @memcpy(buf.items.ptr, latin1.ptr, copy_len);
                        buf.items.len = copy_len;
                        any_needs_escape = true;

                        // The initial capacity is only a guess: a vector full of
                        // escapable bytes expands well past `latin1.len + 6`, so
                        // every append here must be a checked, growing append.
                        comptime var i: usize = 0;
                        inline while (i < ascii_vector_size) : (i += 1) {
                            try appendEscapedLatin1Byte(&buf, vec[i]);
                        }

                        remaining = remaining[ascii_vector_size..];
                        break :scan_and_allocate_lazily;
                    }

                    remaining = remaining[ascii_vector_size..];
                }

                if (any_needs_escape) {
                    // pass #2: we found something that needed an escape
                    // so we'll go ahead and copy the buffer into a new buffer
                    while (remaining.len >= ascii_vector_size) {
                        const vec: AsciiVector = remaining[0..ascii_vector_size].*;
                        if (@reduce(
                            .Or,
                            @bitCast(AsciiVectorU1, (vec == vecs[0])) |
                                @bitCast(AsciiVectorU1, (vec == vecs[1])) |
                                @bitCast(AsciiVectorU1, (vec == vecs[2])) |
                                @bitCast(AsciiVectorU1, (vec == vecs[3])) |
                                @bitCast(AsciiVectorU1, (vec == vecs[4])),
                        ) == 1) {
                            // Reserve room for at least the unescaped bytes up front;
                            // entities grow the buffer further as required.
                            try buf.ensureUnusedCapacity(ascii_vector_size);
                            comptime var i: usize = 0;
                            inline while (i < ascii_vector_size) : (i += 1) {
                                try appendEscapedLatin1Byte(&buf, vec[i]);
                            }

                            remaining = remaining[ascii_vector_size..];
                            continue;
                        }

                        // Clean vector: copy all of its bytes in one store.
                        try buf.ensureUnusedCapacity(ascii_vector_size);
                        buf.items.ptr[buf.items.len .. buf.items.len + ascii_vector_size][0..ascii_vector_size].* = remaining[0..ascii_vector_size].*;
                        buf.items.len += ascii_vector_size;
                        remaining = remaining[ascii_vector_size..];
                    }
                }
            }

            if (!any_needs_escape) {
                // Byte-wise version of pass #1 for whatever the vector scan did
                // not cover (the short tail, or the whole input on other targets).
                scan_and_allocate_lazily: while (remaining.len > 0) {
                    const c = remaining[0];
                    switch (c) {
                        '"', '&', '\'', '<', '>' => {
                            const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
                            buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
                            @memcpy(buf.items.ptr, latin1.ptr, copy_len);
                            buf.items.len = copy_len;
                            try appendEscapedLatin1Byte(&buf, c);
                            remaining = remaining[1..];
                            any_needs_escape = true;
                            break :scan_and_allocate_lazily;
                        },
                        else => {
                            remaining = remaining[1..];
                        },
                    }
                }
            }

            if (remaining.len > 0) {
                // Leftover bytes can only exist once a buffer has been started:
                // the byte-wise scan above otherwise consumes everything.
                std.debug.assert(any_needs_escape);
                for (remaining) |c| {
                    try appendEscapedLatin1Byte(&buf, c);
                }
            }

            if (any_needs_escape) {
                return buf.toOwnedSlice();
            } else {
                return latin1;
            }
        },
    }
}
test "copyLatin1IntoUTF8" {
var input: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!";
var output = std.mem.zeroes([500]u8);