bun.sh/test/js/web/encoding/text-encoder-stream.test.ts

import { expect, test } from "bun:test";
import { readableStreamFromArray } from "harness";

// META: global=window,worker
// META: script=resources/readable-stream-from-array.js
// META: script=resources/readable-stream-to-array.js

const inputString = "I \u{1F499} streams";
const expectedOutputBytes = [0x49, 0x20, 0xf0, 0x9f, 0x92, 0x99, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x73];
// This is a character that must be represented in two code units in a string,
// ie. it is not in the Basic Multilingual Plane.
const astralCharacter = "\u{1F499}"; // BLUE HEART
const astralCharacterEncoded = [0xf0, 0x9f, 0x92, 0x99];
const leading = astralCharacter[0];
const trailing = astralCharacter[1];
const replacementEncoded = [0xef, 0xbf, 0xbd];

// These tests assume that the implementation correctly classifies leading and
// trailing surrogates and treats all the code units in each set equivalently.

const testCases = [
  {
    input: [inputString],
    output: [expectedOutputBytes],
    description: "encoding one string of UTF-8 should give one complete chunk",
  },
  {
    input: [leading, trailing],
    output: [astralCharacterEncoded],
    description: "a character split between chunks should be correctly encoded",
  },
  {
    input: [leading, trailing + astralCharacter],
    output: [astralCharacterEncoded.concat(astralCharacterEncoded)],
    description: "a character following one split between chunks should be " + "correctly encoded",
  },
  {
    input: [leading, trailing + leading, trailing],
    output: [astralCharacterEncoded, astralCharacterEncoded],
    description: "two consecutive astral characters each split down the " + "middle should be correctly reassembled",
  },
  {
    input: [leading, trailing + leading + leading, trailing],
    output: [astralCharacterEncoded.concat(replacementEncoded), astralCharacterEncoded],
    description:
      "two consecutive astral characters each split down the " +
      "middle with an invalid surrogate in the middle should be correctly " +
      "encoded",
  },
  {
    input: [leading],
    output: [replacementEncoded],
    description: "a stream ending in a leading surrogate should emit a " + "replacement character as a final chunk",
  },
  {
    input: [leading, astralCharacter],
    output: [replacementEncoded.concat(astralCharacterEncoded)],
    description:
      "an unmatched surrogate at the end of a chunk followed by " +
      "an astral character in the next chunk should be replaced with " +
      "the replacement character at the start of the next output chunk",
  },
  {
    input: [leading, "A"],
    output: [replacementEncoded.concat([65])],
    description:
      "an unmatched surrogate at the end of a chunk followed by " +
      "an ascii character in the next chunk should be replaced with " +
      "the replacement character at the start of the next output chunk",
  },
  {
    input: [leading, leading, trailing],
    output: [replacementEncoded, astralCharacterEncoded],
    description:
      "an unmatched surrogate at the end of a chunk followed by " +
      "a plane 1 character split into two chunks should result in " +
      "the encoded plane 1 character appearing in the last output chunk",
  },
  {
    input: [leading, leading],
    output: [replacementEncoded, replacementEncoded],
    description: "two leading chunks should result in two replacement " + "characters",
  },
  {
    input: [leading + leading, trailing],
    output: [replacementEncoded, astralCharacterEncoded],
    description: "a non-terminal unpaired leading surrogate should " + "immediately be replaced",
  },
  {
    input: [trailing, astralCharacter],
    output: [replacementEncoded, astralCharacterEncoded],
    description: "a terminal unpaired trailing surrogate should " + "immediately be replaced",
  },
  {
    input: [leading, "", trailing],
    output: [astralCharacterEncoded],
    description: "a leading surrogate chunk should be carried past empty chunks",
  },
  {
    input: [leading, ""],
    output: [replacementEncoded],
    description: "a leading surrogate chunk should error when it is clear " + "it didn't form a pair",
  },
  {
    input: [""],
    output: [],
    description: "an empty string should result in no output chunk",
  },
  {
    input: ["", inputString],
    output: [expectedOutputBytes],
    description: "a leading empty chunk should be ignored",
  },
  {
    input: [inputString, ""],
    output: [expectedOutputBytes],
    description: "a trailing empty chunk should be ignored",
  },
  {
    input: ["A"],
    output: [[65]],
    description: "a plain ASCII chunk should be converted",
  },
  {
    input: ["\xff"],
    output: [[195, 191]],
    description: "characters in the ISO-8859-1 range should be encoded correctly",
  },
];

for (const { input, output, description } of testCases) {
  test(description, async () => {
    const inputStream = readableStreamFromArray(input);
    const outputStream = inputStream.pipeThrough(new TextEncoderStream());
    const chunkArray = await Bun.readableStreamToArray(outputStream);
    expect(chunkArray.length, "number of chunks should match").toBe(output.length);
    for (let i = 0; i < output.length; ++i) {
      expect(chunkArray[i].constructor).toBe(Uint8Array);
      expect(chunkArray[i].length).toBe(output[i].length);
      for (let j = 0; j < output[i].length; ++j) {
        expect(chunkArray[i][j]).toBe(output[i][j]);
      }
    }
  });
}