bun.sh/test/js/bun/http/decodeURIComponentSIMD.test.ts

import { decodeURIComponentSIMD } from "bun:internal-for-testing";
import { describe, expect, it } from "bun:test";

const inputs = [
  "hello world",
  "hello world  ",
  " hello world",
  "!@#$%^&*()",
  "1234567890",
  "abcdefghijklmnopqrstuvwxyz",
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
  "こんにちは",
  "你好",
  "안녕하세요",
  "مرحبا",
  "שָׁלוֹם",
  "🌍🌎🌏",
  "👨‍👩‍👧‍👦",
  "🇺🇸🇯🇵🇰🇷",
  "https://example.com/path?param=value",
  "user@example.com",
  "path/to/file.txt",
  "C:\\Windows\\System32",
  "<script>alert('xss')</script>",
  "SELECT * FROM users;",
  "{}[]|\\",
  "     ",
  "",
  "a".repeat(1000),
  "🌟".repeat(100),
  "hello\nworld",
  "hello\tworld",
  "hello\rworld",
  "hello\\world",
  'hello"world',
  "hello'world",
  "hello`world",
  "hello/world",
  "hello?world",
  "hello=world",
  "hello&world",
  "hello+world",
  "hello%20world",
  "hello%2Fworld",
  "hello%3Fworld",
  "hello%3Dworld",
  "hello%26world",
  "hello%2Bworld",
  "hello%25world",
  "hello%23world",
  "hello%40world",
  "hello%21world",
  "hello%24world",
  "hello%2Cworld",
  "hello%3Bworld",
  "hello%3Aworld",
  "hello%5Bworld",
  "hello%5Dworld",
  "hello%7Bworld",
  "hello%7Dworld",
  "hello%7Cworld",
  "hello%5Cworld",
  "hello%22world",
  "hello%27world",
  "hello%60world",
  "hello%3Cworld",
  "hello%3Eworld",
  "hello%2Eworld",
  "hello%2Dworld",
  "hello%5Fworld",
  "hello%7Eworld",
  "hello%2Aworld",
  "hello%2Bworld",
  "hello%2Cworld",
  "hello%2Fworld",
  "hello%3Aworld",
  "hello%3Bworld",
  "hello%3Cworld",
  "hello%3Dworld",
  "hello%3Eworld",
  "hello%3Fworld",
  "hello%40world",
  "hello%5Bworld",
  "hello%5Cworld",
  "hello%5Dworld",
  "hello%5Eworld",
  "hello%5Fworld",
  "hello%60world",
  "hello%7Bworld",
  "hello%7Cworld",
  "hello%7Dworld",
  "hello%7Eworld",
  "hello%7Fworld",
  "hello%80world",
  "hello%FFworld",
  "hello%F0%9F%8C%9F",
  "hello%F0%9F%98%80",
  "hello%F0%9F%98%81",
  "hello%F0%9F%98%82",
  "hello%F0%9F%98%83",
  "hello%F0%9F%98%84",
  "hello%F0%9F%98%85",
  "hello%F0%9F%98%86",
  "hello%F0%9F%98%87",
  "hello%F0%9F%98%88",
  "hello%F0%9F%98%89",
  "hello%F0%9F%98%8A",
  "hello%F0%9F%98%8B",
  "hello%F0%9F%98%8C",
  "hello%F0%9F%98%8D",
  "hello%F0%9F%98%8E",
  "hello%F0%9F%98%8F",
  "hello%F0%9F%98%90",
  "hello%F0%9F%98%91",
  // Test 16-byte boundary cases
  "1234567890123456%20", // % at byte 16
  "123456789012345%20a", // % at byte 15
  "12345678901234%20ab", // % at byte 14
  "1234567890123%20abc", // % at byte 13
  "123456789012%20abcd", // % at byte 12
  "12345678901%20abcde", // % at byte 11
  "1234567890%20abcdef", // % at byte 10
  "123456789%20abcdefg", // % at byte 9
  "12345678%20abcdefgh", // % at byte 8
  "1234567%20abcdefghi", // % at byte 7
  "123456%20abcdefghij", // % at byte 6
  "12345%20abcdefghijk", // % at byte 5
  "1234%20abcdefghijkl", // % at byte 4
  "123%20abcdefghijklm", // % at byte 3
  "12%20abcdefghijklmn", // % at byte 2
  "1%20abcdefghijklmno", // % at byte 1
  "%20abcdefghijklmnop", // % at byte 0
  "1234567890123456%20abcd", // Multiple of 16 before %
  "12345678901234567890%20", // Multiple of 16 + 4 before %
  "123456789012345678901234567890%20", // Multiple of 16 + 14 before %

  // Additional boundary tests with different encoded characters
  "1234567890123456%2B", // + at boundary
  "1234567890123456%3D", // = at boundary
  "1234567890123456%2F", // / at boundary
  "1234567890123456%3F", // ? at boundary
  "1234567890123456%26", // & at boundary

  // Multiple percent encodings near boundaries
  "12345678901234%20%20", // Two spaces at boundary
  "1234567890123%20%20a", // Two spaces near boundary
  "123456789012%20%20ab", // Two spaces near boundary

  // UTF-8 multi-byte sequences at boundaries
  "1234567890123456%F0%9F%98%80", // Emoji at boundary
  "12345678901234%F0%9F%98%80ab", // Emoji near boundary
  "123456789012%F0%9F%98%80abcd", // Emoji near boundary

  // Mixed ASCII and encoded characters
  "1234567890123456%20ABC%20",
  "1234567890123456%20%F0%9F%98%80",
  "12345678901234%20%F0%9F%98%80ab",

  // Multiple boundaries in sequence
  "1234567890123456%201234567890123456%20",
  "1234567890123456%201234567890123456%2B",
  "1234567890123456%201234567890123456%3D",

  // Testing with different encoded characters at boundaries
  "1234567890123456%251234567890123456%24",
  "1234567890123456%261234567890123456%23",
  "1234567890123456%271234567890123456%22",

  // Testing with invalid sequences at boundaries
  "1234567890123456%", // Incomplete percent encoding at boundary
  "1234567890123456%2", // Incomplete percent encoding at boundary
  "1234567890123456%G0", // Invalid hex digit at boundary

  // Testing with multiple encodings in quick succession
  "12345678901234%20%20%20%20",
  "1234567890123%20%20%20%20a",
  "123456789012%20%20%20%20ab",

  // Testing with mixed valid and invalid sequences
  "1234567890123456%20%GG%20",
  "1234567890123456%20%%20",
  "1234567890123456%20%2%20",

  // Testing boundaries with special characters
  "1234567890123456%0A", // newline
  "1234567890123456%0D", // carriage return
  "1234567890123456%09", // tab

  // Testing with URL-specific characters
  "1234567890123456%3A%2F%2F", // ://
  "1234567890123456%3F%3D%26", // ?=&
  "1234567890123456%23%40%21", // #@!

  // Testing with multiple boundaries and mixed content
  "1234567890123456%201234567890123456%F0%9F%98%80",
  "1234567890123456%2B1234567890123456%20%F0%9F%98%80",
  "1234567890123456%3D1234567890123456%20ABC%20",

  // Edge cases with repeated patterns
  "1234567890123456%20%20%20%201234567890123456%20%20%20%20",
  "1234567890123456%25%25%25%251234567890123456%25%25%25%25",
  "1234567890123456%2B%2B%2B%2B1234567890123456%2B%2B%2B%2B",
];

// Additional test cases for production quality URI component decoder
const additionalInputs = [
  // 1. Invalid UTF-8 Sequences

  // Incomplete UTF-8 sequences
  "%E2%82", // Incomplete euro symbol
  "%F0%90", // Incomplete 4-byte sequence
  "%C2", // Incomplete 2-byte sequence

  // Overlong encodings
  "%C0%AF", // Overlong '/' (should be %2F)
  "%E0%80%AF", // Overlong '/' (3-byte)
  "%F0%80%80%AF", // Overlong '/' (4-byte)

  // Invalid UTF-8 continuation bytes
  "%C2%C0", // Invalid continuation
  "%E2%82%C0", // Invalid continuation in 3-byte sequence
  "%F0%90%80%C0", // Invalid continuation in 4-byte sequence

  // UTF-16 surrogate halves encoded in UTF-8
  "%ED%A0%80", // Lead surrogate U+D800
  "%ED%BE%80", // Trail surrogate U+DFFF
  "%ED%A0%80%ED%B0%80", // Surrogate pair encoded in UTF-8

  // 2. Memory and Buffer Edge Cases

  // SIMD boundary alignment
  "a".repeat(15) + "%20", // 15 chars + encoded char
  "a".repeat(16) + "%20", // 16 chars + encoded char
  "a".repeat(31) + "%20", // 31 chars + encoded char
  "a".repeat(32) + "%20", // 32 chars + encoded char

  // Large strings
  "a".repeat(1024) + "%20" + "b".repeat(1024),
  "%20".repeat(1000), // Many encoded characters
  ("a".repeat(15) + "%20").repeat(100), // Repeating pattern at SIMD boundary

  // StringBuilder reallocation
  "%F0%9F%98%80".repeat(1000), // Many emoji forcing StringBuilder growth

  // 3. Malformed Percent Encodings

  // Missing digits
  "%",
  "%%",
  "%2",
  "hello%",
  "hello%2",

  // Invalid hex digits
  "%0G",
  "%G0",
  "%GG",
  "%00%0G",

  // Mixed case hex digits
  "%2f",
  "%2F",
  "%2a",
  "%2A",

  // Multiple % characters
  "%%%",
  "%%%%",
  "%2%3",
  "%25%25",

  // 4. Special Cases

  // Mixed valid and invalid sequences
  "valid%20invalid%GGvalid%20",
  "%20%FF%20",

  // Boundary conditions with valid/invalid sequences
  "a".repeat(15) + "%GG",
  "a".repeat(16) + "%GG",
  "a".repeat(31) + "%GG",

  // Edge cases around StringBuilder capacity
  ("valid%20" + "a".repeat(60)).repeat(100),

  // UTF-8 edge cases
  "%F4%8F%BF%BF", // U+10FFFF (highest valid codepoint)
  "%F4%90%80%80", // Above U+10FFFF (invalid)

  // Complex mixed scenarios
  "hello%20%E2%82%AC%F0%9F%98%80world", // ASCII + space + euro + emoji
  "%E2%82%AC".repeat(100) + "%F0%9F%98%80".repeat(100), // Alternating 3-byte and 4-byte sequences
];

describe("decodeURIComponentSIMD", () => {
  for (const input of inputs) {
    it(`should decode ${input}`, () => {
      const encoded = encodeURIComponent(input);
      const decoded = decodeURIComponentSIMD(encoded);
      expect(decoded).toBe(decodeURIComponent(encoded));
    });
  }
});

describe("decodeURIComponentSIMD - Additional Tests", () => {
  // Test error handling
  for (const input of additionalInputs) {
    it(`should handle ${input} without crashing`, () => {
      try {
        const decoded = decodeURIComponentSIMD(input);
        // Some inputs are invalid, but shouldn't crash
        if (decoded !== undefined) {
          // For valid inputs, compare with native implementation
          try {
            const expected = decodeURIComponent(input);
            expect(decoded).toBe(expected);
          } catch (e) {
            // Native implementation threw, our implementation should too
            expect(() => decodeURIComponentSIMD(input)).toThrow();
          }
        }
      } catch (e) {
        // If it throws, make sure native implementation also throws
        expect(() => decodeURIComponent(input)).toThrow();
      }
    });
  }
});

describe("decodeURIComponentSIMD edge cases", () => {
  it("should handle cursor advancement correctly with invalid hex", () => {
    // This test would fail because of the cursor advancement bug
    // When it sees %GG, it only advances by 1 instead of 3, causing
    // the GG to be treated as literal characters
    expect(decodeURIComponentSIMD("%GG%20test")).toBe(String.fromCodePoint(0xfffd) + " " + "test");
  });

  it("should handle multiple invalid sequences consecutively", () => {
    // Similar cursor advancement issue
    expect(decodeURIComponentSIMD("%ZZ%XX%YY")).toBe(String.fromCodePoint(0xfffd).repeat(3));
  });

  it("should handle incomplete sequences at SIMD boundaries", () => {
    // Create a string that puts a % character right at the SIMD boundary
    // then follow it with invalid hex digits
    const prefix = "a".repeat(15); // 15 bytes to align the % at boundary
    expect(decodeURIComponentSIMD(prefix + "%GG")).toBe(prefix + String.fromCodePoint(0xfffd));
  });

  it("should handle mixed valid/invalid sequences at SIMD boundaries", () => {
    // This combines SIMD boundary alignment with the cursor advancement bug
    const prefix = "a".repeat(15);
    expect(decodeURIComponentSIMD(prefix + "%GG%20%HH%20")).toBe(
      prefix + String.fromCodePoint(0xfffd) + " " + String.fromCodePoint(0xfffd) + " ",
    );
  });

  it("should handle large sequences of invalid encodings", () => {
    // This would really expose the cursor advancement issue
    const input = "%GG".repeat(1000);
    // it should be full of unicode replacement characters
    expect(decodeURIComponentSIMD(input).length).toBe(String.fromCodePoint(0xfffd).repeat(1000).length);
  });

  it("should handle invalid sequences followed by valid UTF-8", () => {
    // This combines the cursor advancement bug with UTF-8 decoding
    expect(decodeURIComponentSIMD("%GG%F0%9F%98%80")).toBe(
      // replacement + replacement + smiley
      String.fromCodePoint(0xfffd) + "😀",
    );
  });
});