mirror of
https://github.com/oven-sh/bun
synced 2026-02-10 02:48:50 +00:00
373 lines
11 KiB
TypeScript
373 lines
11 KiB
TypeScript
import { decodeURIComponentSIMD } from "bun:internal-for-testing";
|
|
import { describe, expect, it } from "bun:test";
|
|
|
|
// Round-trip corpus for the "decodeURIComponentSIMD" suite.
//
// NOTE(review): that suite runs every entry through `encodeURIComponent` before
// decoding, so any literal `%` below reaches the decoder as `%25`. Entries that
// look like escapes ("%20", "%GG", a trailing "%", ...) are therefore plain text
// from the decoder's point of view; only the position of the `%` is preserved.
// Some entries are repeated on purpose or by accident; contents and order are
// kept exactly as they were so the generated test list does not change.
const inputs = [
  "hello world",
  "hello world ",
  " hello world",
  "!@#$%^&*()",
  "1234567890",
  "abcdefghijklmnopqrstuvwxyz",
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
  "こんにちは",
  "你好",
  "안녕하세요",
  "مرحبا",
  "שָׁלוֹם",
  "🌍🌎🌏",
  "👨👩👧👦",
  "🇺🇸🇯🇵🇰🇷",
  "https://example.com/path?param=value",
  "user@example.com",
  "path/to/file.txt",
  "C:\\Windows\\System32",
  "<script>alert('xss')</script>",
  "SELECT * FROM users;",
  "{}[]|\\",
  " ",
  "",
  "a".repeat(1000),
  "🌟".repeat(100),
  "hello\nworld",
  "hello\tworld",
  "hello\rworld",
  "hello\\world",
  'hello"world',
  "hello'world",
  "hello`world",
  "hello/world",
  "hello?world",
  "hello=world",
  "hello&world",
  "hello+world",
  "hello%20world",
  "hello%2Fworld",
  "hello%3Fworld",
  "hello%3Dworld",
  "hello%26world",
  "hello%2Bworld",
  "hello%25world",
  "hello%23world",
  "hello%40world",
  "hello%21world",
  "hello%24world",
  "hello%2Cworld",
  "hello%3Bworld",
  "hello%3Aworld",
  "hello%5Bworld",
  "hello%5Dworld",
  "hello%7Bworld",
  "hello%7Dworld",
  "hello%7Cworld",
  "hello%5Cworld",
  "hello%22world",
  "hello%27world",
  "hello%60world",
  "hello%3Cworld",
  "hello%3Eworld",
  "hello%2Eworld",
  "hello%2Dworld",
  "hello%5Fworld",
  "hello%7Eworld",
  "hello%2Aworld",
  "hello%2Bworld",
  "hello%2Cworld",
  "hello%2Fworld",
  "hello%3Aworld",
  "hello%3Bworld",
  "hello%3Cworld",
  "hello%3Dworld",
  "hello%3Eworld",
  "hello%3Fworld",
  "hello%40world",
  "hello%5Bworld",
  "hello%5Cworld",
  "hello%5Dworld",
  "hello%5Eworld",
  "hello%5Fworld",
  "hello%60world",
  "hello%7Bworld",
  "hello%7Cworld",
  "hello%7Dworld",
  "hello%7Eworld",
  "hello%7Fworld",
  "hello%80world",
  "hello%FFworld",
  "hello%F0%9F%8C%9F",
  "hello%F0%9F%98%80",
  "hello%F0%9F%98%81",
  "hello%F0%9F%98%82",
  "hello%F0%9F%98%83",
  "hello%F0%9F%98%84",
  "hello%F0%9F%98%85",
  "hello%F0%9F%98%86",
  "hello%F0%9F%98%87",
  "hello%F0%9F%98%88",
  "hello%F0%9F%98%89",
  "hello%F0%9F%98%8A",
  "hello%F0%9F%98%8B",
  "hello%F0%9F%98%8C",
  "hello%F0%9F%98%8D",
  "hello%F0%9F%98%8E",
  "hello%F0%9F%98%8F",
  "hello%F0%9F%98%90",
  "hello%F0%9F%98%91",

  // Test 16-byte boundary cases (byte offsets are 0-based positions of the `%`)
  "1234567890123456%20", // % at byte 16
  "123456789012345%20a", // % at byte 15
  "12345678901234%20ab", // % at byte 14
  "1234567890123%20abc", // % at byte 13
  "123456789012%20abcd", // % at byte 12
  "12345678901%20abcde", // % at byte 11
  "1234567890%20abcdef", // % at byte 10
  "123456789%20abcdefg", // % at byte 9
  "12345678%20abcdefgh", // % at byte 8
  "1234567%20abcdefghi", // % at byte 7
  "123456%20abcdefghij", // % at byte 6
  "12345%20abcdefghijk", // % at byte 5
  "1234%20abcdefghijkl", // % at byte 4
  "123%20abcdefghijklm", // % at byte 3
  "12%20abcdefghijklmn", // % at byte 2
  "1%20abcdefghijklmno", // % at byte 1
  "%20abcdefghijklmnop", // % at byte 0
  "1234567890123456%20abcd", // Multiple of 16 before %
  "12345678901234567890%20", // Multiple of 16 + 4 before %
  "123456789012345678901234567890%20", // Multiple of 16 + 14 before %

  // Additional boundary tests with different encoded characters
  "1234567890123456%2B", // + at boundary
  "1234567890123456%3D", // = at boundary
  "1234567890123456%2F", // / at boundary
  "1234567890123456%3F", // ? at boundary
  "1234567890123456%26", // & at boundary

  // Multiple percent encodings near boundaries
  "12345678901234%20%20", // Two spaces at boundary
  "1234567890123%20%20a", // Two spaces near boundary
  "123456789012%20%20ab", // Two spaces near boundary

  // UTF-8 multi-byte sequences at boundaries
  "1234567890123456%F0%9F%98%80", // Emoji at boundary
  "12345678901234%F0%9F%98%80ab", // Emoji near boundary
  "123456789012%F0%9F%98%80abcd", // Emoji near boundary

  // Mixed ASCII and encoded characters
  "1234567890123456%20ABC%20",
  "1234567890123456%20%F0%9F%98%80",
  "12345678901234%20%F0%9F%98%80ab",

  // Multiple boundaries in sequence
  "1234567890123456%201234567890123456%20",
  "1234567890123456%201234567890123456%2B",
  "1234567890123456%201234567890123456%3D",

  // Testing with different encoded characters at boundaries
  "1234567890123456%251234567890123456%24",
  "1234567890123456%261234567890123456%23",
  "1234567890123456%271234567890123456%22",

  // Testing with invalid sequences at boundaries
  "1234567890123456%", // Incomplete percent encoding at boundary
  "1234567890123456%2", // Incomplete percent encoding at boundary
  "1234567890123456%G0", // Invalid hex digit at boundary

  // Testing with multiple encodings in quick succession
  "12345678901234%20%20%20%20",
  "1234567890123%20%20%20%20a",
  "123456789012%20%20%20%20ab",

  // Testing with mixed valid and invalid sequences
  "1234567890123456%20%GG%20",
  "1234567890123456%20%%20",
  "1234567890123456%20%2%20",

  // Testing boundaries with special characters
  "1234567890123456%0A", // newline
  "1234567890123456%0D", // carriage return
  "1234567890123456%09", // tab

  // Testing with URL-specific characters
  "1234567890123456%3A%2F%2F", // ://
  "1234567890123456%3F%3D%26", // ?=&
  "1234567890123456%23%40%21", // #@!

  // Testing with multiple boundaries and mixed content
  "1234567890123456%201234567890123456%F0%9F%98%80",
  "1234567890123456%2B1234567890123456%20%F0%9F%98%80",
  "1234567890123456%3D1234567890123456%20ABC%20",

  // Edge cases with repeated patterns
  "1234567890123456%20%20%20%201234567890123456%20%20%20%20",
  "1234567890123456%25%25%25%251234567890123456%25%25%25%25",
  "1234567890123456%2B%2B%2B%2B1234567890123456%2B%2B%2B%2B",
];
|
|
|
|
// Additional test cases for production quality URI component decoder
|
|
// Raw decoder inputs, many of them deliberately malformed. Unlike a round-trip
// corpus, these strings are meant to be handed to the decoder unchanged.
const additionalInputs = [
  // 1. Invalid UTF-8 Sequences

  // Incomplete UTF-8 sequences
  "%E2%82", // Incomplete euro symbol
  "%F0%90", // Incomplete 4-byte sequence
  "%C2", // Incomplete 2-byte sequence

  // Overlong encodings
  "%C0%AF", // Overlong '/' (should be %2F)
  "%E0%80%AF", // Overlong '/' (3-byte)
  "%F0%80%80%AF", // Overlong '/' (4-byte)

  // Invalid UTF-8 continuation bytes
  "%C2%C0", // Invalid continuation
  "%E2%82%C0", // Invalid continuation in 3-byte sequence
  "%F0%90%80%C0", // Invalid continuation in 4-byte sequence

  // UTF-16 surrogate halves encoded in UTF-8
  "%ED%A0%80", // Lead surrogate U+D800
  "%ED%BE%80", // Trail surrogate U+DF80
  "%ED%A0%80%ED%B0%80", // Surrogate pair (U+D800, U+DC00) encoded in UTF-8

  // 2. Memory and Buffer Edge Cases

  // SIMD boundary alignment
  "a".repeat(15) + "%20", // 15 chars + encoded char
  "a".repeat(16) + "%20", // 16 chars + encoded char
  "a".repeat(31) + "%20", // 31 chars + encoded char
  "a".repeat(32) + "%20", // 32 chars + encoded char

  // Large strings
  "a".repeat(1024) + "%20" + "b".repeat(1024),
  "%20".repeat(1000), // Many encoded characters
  ("a".repeat(15) + "%20").repeat(100), // Repeating pattern at SIMD boundary

  // StringBuilder reallocation
  "%F0%9F%98%80".repeat(1000), // Many emoji forcing StringBuilder growth

  // 3. Malformed Percent Encodings

  // Missing digits
  "%",
  "%%",
  "%2",
  "hello%",
  "hello%2",

  // Invalid hex digits
  "%0G",
  "%G0",
  "%GG",
  "%00%0G",

  // Mixed case hex digits
  "%2f",
  "%2F",
  "%2a",
  "%2A",

  // Multiple % characters
  "%%%",
  "%%%%",
  "%2%3",
  "%25%25",

  // 4. Special Cases

  // Mixed valid and invalid sequences
  "valid%20invalid%GGvalid%20",
  "%20%FF%20",

  // Boundary conditions with valid/invalid sequences
  "a".repeat(15) + "%GG",
  "a".repeat(16) + "%GG",
  "a".repeat(31) + "%GG",

  // Edge cases around StringBuilder capacity
  ("valid%20" + "a".repeat(60)).repeat(100),

  // UTF-8 edge cases
  "%F4%8F%BF%BF", // U+10FFFF (highest valid codepoint)
  "%F4%90%80%80", // Above U+10FFFF (invalid)

  // Complex mixed scenarios
  "hello%20%E2%82%AC%F0%9F%98%80world", // ASCII + space + euro + emoji
  "%E2%82%AC".repeat(100) + "%F0%9F%98%80".repeat(100), // A run of 3-byte sequences followed by a run of 4-byte sequences
];
|
|
|
|
// Round-trip suite: every corpus entry is percent-encoded first, then the SIMD
// decoder's output is compared against the built-in `decodeURIComponent` on the
// same encoded string. Because of the encoding step the decoder only ever sees
// well-formed escapes here.
describe("decodeURIComponentSIMD", () => {
  for (const input of inputs) {
    it(`should decode ${input}`, () => {
      const encoded = encodeURIComponent(input);
      const decoded = decodeURIComponentSIMD(encoded);
      expect(decoded).toBe(decodeURIComponent(encoded));
    });
  }
});
|
|
|
|
// Feeds each raw (possibly malformed) string straight to the SIMD decoder and
// cross-checks it against the built-in `decodeURIComponent`.
//
// Only the decoder calls are wrapped in `try`; the `expect` calls stay outside
// so that an assertion failure is reported as-is instead of being caught and
// re-interpreted as "the decoder threw".
describe("decodeURIComponentSIMD - Additional Tests", () => {
  for (const input of additionalInputs) {
    it(`should handle ${input} without crashing`, () => {
      let decoded;
      try {
        decoded = decodeURIComponentSIMD(input);
      } catch {
        // If it throws, make sure native implementation also throws
        expect(() => decodeURIComponent(input)).toThrow();
        return;
      }

      // Some inputs are invalid, but shouldn't crash; with no value there is
      // nothing further to compare.
      if (decoded === undefined) return;

      let expected;
      try {
        expected = decodeURIComponent(input);
      } catch {
        // The native decoder rejects this input while the SIMD decoder returned
        // a value. This suite only requires agreement when native decoding
        // succeeds, so the mismatch in error behavior is tolerated here.
        return;
      }

      // For valid inputs, compare with native implementation
      expect(decoded).toBe(expected);
    });
  }
});
|
|
|
|
// Pins the decoder's output for malformed escapes: each assertion below expects
// one U+FFFD per invalid "%XX" triple, with the surrounding text decoded normally.
describe("decodeURIComponentSIMD edge cases", () => {
  // U+FFFD REPLACEMENT CHARACTER
  const REPLACEMENT = String.fromCodePoint(0xfffd);

  it("should handle cursor advancement correctly with invalid hex", () => {
    // Guards against advancing the cursor by 1 instead of 3 after an invalid
    // escape, which would leave "GG" in the output as literal characters.
    expect(decodeURIComponentSIMD("%GG%20test")).toBe(REPLACEMENT + " " + "test");
  });

  it("should handle multiple invalid sequences consecutively", () => {
    // Same cursor-advancement concern, with three invalid escapes back to back.
    expect(decodeURIComponentSIMD("%ZZ%XX%YY")).toBe(REPLACEMENT.repeat(3));
  });

  it("should handle incomplete sequences at SIMD boundaries", () => {
    // 15 leading bytes put the `%` at offset 15, followed by invalid hex digits.
    const prefix = "a".repeat(15);
    expect(decodeURIComponentSIMD(prefix + "%GG")).toBe(prefix + REPLACEMENT);
  });

  it("should handle mixed valid/invalid sequences at SIMD boundaries", () => {
    // Combines the offset-15 alignment with alternating invalid and valid escapes.
    const prefix = "a".repeat(15);
    expect(decodeURIComponentSIMD(prefix + "%GG%20%HH%20")).toBe(
      prefix + REPLACEMENT + " " + REPLACEMENT + " ",
    );
  });

  it("should handle large sequences of invalid encodings", () => {
    const input = "%GG".repeat(1000);
    // Only the length is compared: the result must be as long as 1000
    // replacement characters (one UTF-16 code unit each).
    expect(decodeURIComponentSIMD(input).length).toBe(REPLACEMENT.repeat(1000).length);
  });

  it("should handle invalid sequences followed by valid UTF-8", () => {
    // One replacement character for "%GG", then the decoded 4-byte emoji.
    expect(decodeURIComponentSIMD("%GG%F0%9F%98%80")).toBe(REPLACEMENT + "😀");
  });
});
|