Files
bun.sh/src/bun.js/bindings/webcore/URLPatternTokenizer.cpp
Jarred Sumner 0305f3d4d2 feat(url): implement URLPattern API (#25168)
## Summary

Implements the [URLPattern Web
API](https://developer.mozilla.org/en-US/docs/Web/API/URLPattern) based
on WebKit's implementation. URLPattern provides declarative pattern
matching for URLs, similar to how regular expressions work for strings.

### Features

- **Constructor**: Create patterns from strings or `URLPatternInit`
dictionaries
- **`test()`**: Check if a URL matches the pattern (returns boolean)
- **`exec()`**: Extract matched groups from a URL (returns
`URLPatternResult` or null)
- **Pattern properties**: `protocol`, `username`, `password`,
`hostname`, `port`, `pathname`, `search`, `hash`
- **`hasRegExpGroups`**: Detect if the pattern uses custom regular
expressions

### Example Usage

```js
// Match URLs with a user ID parameter
const pattern = new URLPattern({ pathname: '/users/:id' });

pattern.test('https://example.com/users/123'); // true
pattern.test('https://example.com/posts/456'); // false

const result = pattern.exec('https://example.com/users/123');
console.log(result.pathname.groups.id); // "123"

// Wildcard matching
const filesPattern = new URLPattern({ pathname: '/files/*' });
const match = filesPattern.exec('https://example.com/files/image.png');
console.log(match.pathname.groups[0]); // "image.png"
```

## Implementation Notes

- Adapted from WebKit's URLPattern implementation
- Modified JS bindings to work with Bun's infrastructure (simpler
`convertDictionary` patterns, WTF::Variant handling)
- Added IsoSubspaces for proper GC integration

## Test Plan

- [x] 408 tests from Web Platform Tests pass
- [x] Tests fail with system Bun (URLPattern not defined), pass with
debug build
- [x] Manual testing of basic functionality

Fixes https://github.com/oven-sh/bun/issues/2286

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2025-11-28 00:04:30 -08:00

274 lines
9.0 KiB
C++

/*
* Copyright (C) 2024 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "URLPatternTokenizer.h"
#include "ExceptionOr.h"
#include "URLPatternParser.h"
#include <unicode/utf16.h>
#include <wtf/text/MakeString.h>
namespace WebCore {
namespace URLPatternUtilities {
// Reports whether this token is "null", i.e. its `index` evaluates to false.
// A null token is asserted to also have a null `value`.
// NOTE(review): `index` is presumably an optional position, so "false" means "unset" — confirm
// against Token's declaration in URLPatternTokenizer.h.
bool Token::isNull() const
{
    if (index)
        return false;

    ASSERT(value.isNull());
    return true;
}
// https://urlpattern.spec.whatwg.org/#get-the-next-code-point
//
// Reads the unit at m_nextIndex into m_codepoint and advances m_nextIndex past it. When the input is
// not 8-bit and that unit is a UTF-16 lead surrogate immediately followed by a trail surrogate, both
// units are consumed and m_codepoint becomes the combined supplementary code point. A lead surrogate
// with no trail after it is left in m_codepoint unchanged.
//
// The caller must ensure m_nextIndex is inside the input; no bounds check is made on the first read.
void Tokenizer::getNextCodePoint()
{
    m_codepoint = m_input[m_nextIndex];
    ++m_nextIndex;

    // Only 16-bit input can hold surrogates, and a pair needs one more unit to be available.
    bool mayStartSurrogatePair = !m_input.is8Bit() && U16_IS_LEAD(m_codepoint) && m_nextIndex < m_input.length();
    if (!mayStartSurrogatePair)
        return;

    auto trailCandidate = m_input[m_nextIndex];
    if (U16_IS_TRAIL(trailCandidate)) {
        ++m_nextIndex;
        m_codepoint = U16_GET_SUPPLEMENTARY(m_codepoint, trailCandidate);
    }
}
// https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point
//
// Moves the read cursor (m_nextIndex) to `index` and decodes the code point found there, leaving
// m_codepoint set to that code point and m_nextIndex just past it.
void Tokenizer::seekNextCodePoint(size_t index)
{
    m_nextIndex = index;
    getNextCodePoint();
}
// https://urlpattern.spec.whatwg.org/#add-a-token
//
// Appends a token of `currentType` to m_tokenList. The token records the current m_index as its index
// and the `valueLength` units of input starting at `valuePosition` as its value. Afterwards m_index is
// moved to `nextPosition`, which is where tokenizing resumes.
void Tokenizer::addToken(TokenType currentType, size_t nextPosition, size_t valuePosition, size_t valueLength)
{
    auto tokenValue = m_input.substring(valuePosition, valueLength);
    m_tokenList.append(Token { currentType, m_index, tokenValue });
    m_index = nextPosition;
}
// https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length
//
// Same as the four-argument overload, with the value running from `valuePosition` up to (but not
// including) `nextPosition`.
void Tokenizer::addToken(TokenType currentType, size_t nextPosition, size_t valuePosition)
{
    auto defaultLength = nextPosition - valuePosition;
    addToken(currentType, nextPosition, valuePosition, defaultLength);
}
// https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length
//
// Adds a token whose value is the input between the current token start (m_index) and the read
// cursor (m_nextIndex); tokenizing then resumes at the read cursor.
void Tokenizer::addToken(TokenType currentType)
{
    auto tokenStart = m_index;
    auto tokenEnd = m_nextIndex;
    addToken(currentType, tokenEnd, tokenStart);
}
// https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error
//
// Handles a tokenizing error according to the tokenizer's policy:
//  - Strict:  returns a TypeError exception carrying `callerErrorInfo`; no token is added and m_index
//             is left untouched.
//  - Lenient: appends an InvalidChar token spanning [valuePosition, nextPosition), moves m_index to
//             `nextPosition`, and returns success.
ExceptionOr<void> Tokenizer::processTokenizingError(size_t nextPosition, size_t valuePosition, const String& callerErrorInfo)
{
    if (m_policy != TokenizePolicy::Strict) {
        ASSERT(m_policy == TokenizePolicy::Lenient);
        addToken(TokenType::InvalidChar, nextPosition, valuePosition);
        return {};
    }

    return Exception { ExceptionCode::TypeError, callerErrorInfo };
}
Tokenizer::Tokenizer(StringView input, TokenizePolicy tokenizerPolicy)
: m_input(input)
, m_policy(tokenizerPolicy)
{
}
// https://urlpattern.spec.whatwg.org/#tokenize
//
// Splits m_input into a list of tokens, always terminated by an End token, and returns that list.
//
// Tokenizing errors are routed through processTokenizingError():
//  - under TokenizePolicy::Strict it produces a TypeError, which this function returns at the top of
//    the next loop iteration;
//  - under TokenizePolicy::Lenient it appends an InvalidChar token and scanning carries on.
//
// The Strict path relies on every error branch `continue`-ing WITHOUT having advanced m_index: the
// loop condition therefore still holds, the loop is re-entered, and the pending exception is seen.
// Any new error branch must keep that property (in particular, it must not fall through to addToken).
ExceptionOr<Vector<Token>> Tokenizer::tokenize()
{
    ExceptionOr<void> maybeException;

    while (m_index < m_input.length()) {
        if (m_policy == TokenizePolicy::Strict && maybeException.hasException())
            return maybeException.releaseException();

        seekNextCodePoint(m_index);

        if (m_codepoint == '*') {
            addToken(TokenType::Asterisk);
            continue;
        }

        if (m_codepoint == '+' || m_codepoint == '?') {
            addToken(TokenType::OtherModifier);
            continue;
        }

        if (m_codepoint == '\\') {
            // A backslash as the very last unit has nothing to escape.
            if (m_index == m_input.length() - 1) {
                maybeException = processTokenizingError(m_nextIndex, m_index, "No character is provided after escape."_s);
                continue;
            }

            // The token's value is the escaped code point only, not the backslash.
            auto escapedIndex = m_nextIndex;
            getNextCodePoint();
            addToken(TokenType::EscapedChar, m_nextIndex, escapedIndex);
            continue;
        }

        if (m_codepoint == '{') {
            addToken(TokenType::Open);
            continue;
        }

        if (m_codepoint == '}') {
            addToken(TokenType::Close);
            continue;
        }

        if (m_codepoint == ':') {
            // Consume the longest run of valid name code points following the ':'.
            auto namePosition = m_nextIndex;
            auto nameStart = namePosition;

            while (namePosition < m_input.length()) {
                seekNextCodePoint(namePosition);
                bool isValidCodepoint = isValidNameCodepoint(m_codepoint, namePosition == nameStart ? IsFirst::Yes : IsFirst::No);
                if (!isValidCodepoint)
                    break;
                namePosition = m_nextIndex;
            }

            // Nothing consumed: ':' was not followed by a valid name.
            if (namePosition <= nameStart) {
                maybeException = processTokenizingError(nameStart, m_index, makeString("Name position "_s, String::number(namePosition), " is less than name start "_s, String::number(nameStart)));
                continue;
            }

            addToken(TokenType::Name, namePosition, nameStart);
            continue;
        }

        if (m_codepoint == '(') {
            // Scan a parenthesised regular expression group, tracking nesting depth until the
            // parenthesis that opened it is closed.
            unsigned depth = 1;
            auto regexPosition = m_nextIndex;
            auto regexStart = regexPosition;
            bool hasError = false;

            while (regexPosition < m_input.length()) {
                seekNextCodePoint(regexPosition);

                if (!isASCII(m_codepoint)) {
                    maybeException = processTokenizingError(regexStart, m_index, "Current codepoint is not ascii"_s);
                    hasError = true;
                    break;
                }

                if (regexPosition == regexStart && m_codepoint == '?') {
                    maybeException = processTokenizingError(regexStart, m_index, "Regex cannot start with modifier."_s);
                    hasError = true;
                    break;
                }

                if (m_codepoint == '\\') {
                    if (regexPosition == m_input.length() - 1) {
                        maybeException = processTokenizingError(regexStart, m_index, "No character is provided after escape."_s);
                        hasError = true;
                        break;
                    }

                    // Skip over the escaped code point so it is never treated as a parenthesis.
                    getNextCodePoint();
                    if (!isASCII(m_codepoint)) {
                        maybeException = processTokenizingError(regexStart, m_index, "Current codepoint is not ascii"_s);
                        hasError = true;
                        break;
                    }

                    regexPosition = m_nextIndex;
                    continue;
                }

                if (m_codepoint == ')') {
                    depth = depth - 1;
                    if (!depth) {
                        // Leave regexPosition just past the closing ')'.
                        regexPosition = m_nextIndex;
                        break;
                    }
                }

                if (m_codepoint == '(') {
                    depth = depth + 1;
                    if (regexPosition == m_input.length() - 1) {
                        maybeException = processTokenizingError(regexStart, m_index, "No closing token is provided by end of string."_s);
                        hasError = true;
                        break;
                    }

                    // A nested '(' must be followed by '?'. Peek at the next code point, then
                    // restore the cursor so the '?' itself is scanned by the next iteration.
                    auto temporaryPosition = m_nextIndex;
                    getNextCodePoint();
                    if (m_codepoint != '?') {
                        maybeException = processTokenizingError(regexStart, m_index, "Required OtherModifier token is not provided in regex."_s);
                        hasError = true;
                        break;
                    }
                    m_nextIndex = temporaryPosition;
                }

                regexPosition = m_nextIndex;
            }

            if (hasError)
                continue;

            if (depth) {
                maybeException = processTokenizingError(regexStart, m_index, "Current open token does not have a corresponding close token."_s);
                continue;
            }

            // regexPosition is just past the closing ')'; subtracting 1 excludes it from the value.
            auto regexLength = regexPosition - regexStart - 1;
            if (!regexLength) {
                maybeException = processTokenizingError(regexStart, m_index, "Regex length is zero."_s);
                // Per the spec, an empty group yields no Regexp token. Continuing here also keeps
                // m_index unchanged under Strict so the exception is returned even when "()" ends
                // the input.
                continue;
            }

            addToken(TokenType::Regexp, regexPosition, regexStart, regexLength);
            continue;
        }

        addToken(TokenType::Char);
    }

    // The End token has an empty value positioned at the end of the consumed input.
    addToken(TokenType::End, m_index, m_index);
    return WTFMove(m_tokenList);
}
} // namespace URLPatternUtilities
} // namespace WebCore