mirror of
https://github.com/oven-sh/bun
synced 2026-02-10 10:58:56 +00:00
fix oniguruma regex character properties (#1528)
* fix for character properties
* cleanup tests
* cleanup comments
* i - 2 >= 0
This commit is contained in:
@@ -63,14 +63,17 @@ static WTF::String to16Bit(JSValue jsValue, JSC::JSGlobalObject* globalObject, A
|
||||
return to16Bit(jsString, globalObject);
|
||||
}
|
||||
|
||||
static WTF::String extendMultibyteHexCharacters(const WTF::String& string)
|
||||
static WTF::String convertToOnigurumaSyntax(const WTF::String& string)
|
||||
{
|
||||
WTF::StringBuilder sb;
|
||||
uint32_t length = string.length();
|
||||
const UChar* characters = string.characters16();
|
||||
bool inCharacterClass = false;
|
||||
bool inCharacterProperty = false;
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
|
||||
// extend multibyte hex characters
|
||||
while (characters[i] == '\\') {
|
||||
if (i + 1 < length && characters[i + 1] == 'x') {
|
||||
if (i + 2 < length && isxdigit(characters[i + 2])) {
|
||||
@@ -95,6 +98,58 @@ static WTF::String extendMultibyteHexCharacters(const WTF::String& string)
|
||||
break;
|
||||
}
|
||||
|
||||
// convert character properties
|
||||
if (UNLIKELY(characters[i] == '{' && i - 2 >= 0 && (characters[i - 1] == 'p' || characters[i - 1] == 'P') && characters[i - 2] == '\\')) {
|
||||
sb.append(characters[i]);
|
||||
i += 1;
|
||||
if (i == length) {
|
||||
break;
|
||||
}
|
||||
|
||||
// handle negative
|
||||
if (characters[i] == '^') {
|
||||
sb.append(characters[i]);
|
||||
i += 1;
|
||||
if (i == length) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// could be \p{propName=propValue} or \p{propValue}.
|
||||
bool foundEquals = false;
|
||||
WTF::StringBuilder propName;
|
||||
while (characters[i] != '}') {
|
||||
if (characters[i] == '=') {
|
||||
foundEquals = true;
|
||||
i += 1;
|
||||
if (i == length) {
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (foundEquals) {
|
||||
sb.append(characters[i]);
|
||||
} else {
|
||||
propName.append(characters[i]);
|
||||
}
|
||||
|
||||
i += 1;
|
||||
if (i == length) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundEquals) {
|
||||
sb.append(propName.toString());
|
||||
}
|
||||
}
|
||||
|
||||
if (i >= length) {
|
||||
break;
|
||||
}
|
||||
|
||||
// escape brackets in character classes
|
||||
if (inCharacterClass) {
|
||||
// we know ']' will be escaped so there isn't a need to scan for the closing bracket
|
||||
if (characters[i] == '[' || characters[i] == ']') {
|
||||
@@ -518,13 +573,13 @@ JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncCompile, (JSGlobalObject * glob
|
||||
return JSValue::encode({});
|
||||
}
|
||||
thisRegExp->setPatternString(regExpObject->patternString());
|
||||
patternStringExtended = extendMultibyteHexCharacters(thisRegExp->patternString());
|
||||
patternStringExtended = convertToOnigurumaSyntax(thisRegExp->patternString());
|
||||
thisRegExp->setFlagsString(regExpObject->flagsString());
|
||||
} else {
|
||||
WTF::String newPatternString = to16Bit(arg0, globalObject, "(?:)"_s);
|
||||
RETURN_IF_EXCEPTION(scope, {});
|
||||
|
||||
patternStringExtended = extendMultibyteHexCharacters(newPatternString);
|
||||
patternStringExtended = convertToOnigurumaSyntax(newPatternString);
|
||||
|
||||
WTF::String newFlagsString = to16Bit(arg1, globalObject, ""_s);
|
||||
RETURN_IF_EXCEPTION(scope, {});
|
||||
@@ -543,7 +598,7 @@ JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncCompile, (JSGlobalObject * glob
|
||||
// for pattern syntax checking
|
||||
int errorCode = 0;
|
||||
OnigErrorInfo errorInfo = { 0 };
|
||||
regex_t* onigurumaRegExp = createOnigurumaRegExp(globalObject, extendMultibyteHexCharacters(thisRegExp->patternString()), thisRegExp->flagsString(), errorCode, errorInfo);
|
||||
regex_t* onigurumaRegExp = createOnigurumaRegExp(globalObject, convertToOnigurumaSyntax(thisRegExp->patternString()), thisRegExp->flagsString(), errorCode, errorInfo);
|
||||
if (errorCode != ONIG_NORMAL) {
|
||||
OnigUChar errorBuff[ONIG_MAX_ERROR_MESSAGE_LEN] = { 0 };
|
||||
int length = onig_error_code_to_str(errorBuff, errorCode, &errorInfo);
|
||||
@@ -588,7 +643,7 @@ JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncTest, (JSGlobalObject * globalO
|
||||
|
||||
int errorCode = 0;
|
||||
OnigErrorInfo errorInfo = { 0 };
|
||||
regex_t* onigurumaRegExp = createOnigurumaRegExp(globalObject, extendMultibyteHexCharacters(thisValue->patternString()), thisValue->flagsString(), errorCode, errorInfo);
|
||||
regex_t* onigurumaRegExp = createOnigurumaRegExp(globalObject, convertToOnigurumaSyntax(thisValue->patternString()), thisValue->flagsString(), errorCode, errorInfo);
|
||||
if (errorCode != ONIG_NORMAL) {
|
||||
OnigUChar errorBuff[ONIG_MAX_ERROR_MESSAGE_LEN] = { 0 };
|
||||
int length = onig_error_code_to_str(errorBuff, errorCode, &errorInfo);
|
||||
@@ -674,7 +729,7 @@ JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncExec, (JSGlobalObject * globalO
|
||||
|
||||
int errorCode = 0;
|
||||
OnigErrorInfo errorInfo = { 0 };
|
||||
regex_t* onigurumaRegExp = createOnigurumaRegExp(globalObject, extendMultibyteHexCharacters(thisValue->patternString()), thisValue->flagsString(), errorCode, errorInfo);
|
||||
regex_t* onigurumaRegExp = createOnigurumaRegExp(globalObject, convertToOnigurumaSyntax(thisValue->patternString()), thisValue->flagsString(), errorCode, errorInfo);
|
||||
if (errorCode != ONIG_NORMAL) {
|
||||
OnigUChar errorBuff[ONIG_MAX_ERROR_MESSAGE_LEN] = { 0 };
|
||||
int length = onig_error_code_to_str(errorBuff, errorCode, &errorInfo);
|
||||
@@ -859,7 +914,7 @@ static JSC::EncodedJSValue constructOrCall(Zig::GlobalObject* globalObject, JSVa
|
||||
// create for pattern compilation errors, but need to create another for each exec/test
|
||||
int errorCode = 0;
|
||||
OnigErrorInfo errorInfo = { 0 };
|
||||
regex_t* onigurumaRegExp = createOnigurumaRegExp(globalObject, extendMultibyteHexCharacters(patternString), flagsString, errorCode, errorInfo);
|
||||
regex_t* onigurumaRegExp = createOnigurumaRegExp(globalObject, convertToOnigurumaSyntax(patternString), flagsString, errorCode, errorInfo);
|
||||
if (errorCode != ONIG_NORMAL) {
|
||||
OnigUChar errorBuff[ONIG_MAX_ERROR_MESSAGE_LEN] = { 0 };
|
||||
int length = onig_error_code_to_str(errorBuff, errorCode, &errorInfo);
|
||||
|
||||
@@ -3,6 +3,52 @@ import { OnigurumaRegExp } from "bun";
|
||||
import { expect, it, test } from "bun:test";
|
||||
import { gc as gcTrace } from "./gc";
|
||||
|
||||
/**
 * Exercises Unicode character-property escapes (`\p{...}` / `\P{...}`) in
 * OnigurumaRegExp and compares the results with the built-in RegExp.
 *
 * Oniguruma does not support the `\p{name=value}` spelling. The pattern is
 * rewritten before compilation so that the `name=` part is dropped and only
 * `\p{value}` reaches the engine.
 */
it("character property scripts", () => {
  const sentence = "A ticket to 大阪 costs ¥2000 👌.";

  // Patterns for which both engines are expected to return the same matches.
  const sharedPatterns = [
    "\\p{Emoji_Presentation}",
    "\\P{Script_Extensions=Latin}+",
    "\\p{Sc}|\\p{P}",
  ];

  for (const pattern of sharedPatterns) {
    // `match` yields null when nothing matches; normalise to an empty list so
    // a missing result shows up as a failed expectation, not a TypeError.
    const actual = sentence.match(OnigurumaRegExp(pattern, "gu")) ?? [];
    const expected = sentence.match(RegExp(pattern, "gu")) ?? [];

    // Check the count first: comparing index by index over `actual` alone
    // would only prove that it is a prefix of `expected`.
    expect(actual.length).toBe(expected.length);
    for (const [i, matched] of actual.entries()) {
      expect(matched).toBe(expected[i]);
    }
  }

  // The built-in engine treats Script and Script_Extensions as different
  // properties: only the latter matches this character.
  expect("٢".match(new RegExp("\\p{Script=Thaana}", "u"))).toBe(null);
  expect("٢".match(new RegExp("\\p{Script_Extensions=Thaana}", "u"))![0]).toBe(
    "٢",
  );

  // With the `name=` prefix stripped, both spellings end up as \p{Thaana} for
  // Oniguruma, so here neither of them matches.
  expect("٢".match(new OnigurumaRegExp("\\p{Thaana}", "u"))).toBe(null);
  expect(
    "٢".match(new OnigurumaRegExp("\\p{Script_Extensions=Thaana}", "u")),
  ).toBe(null);

  // Several `name=value` properties inside character classes. The point of
  // this case is that construction completes without throwing.
  let r1 = new OnigurumaRegExp(
    "<\\/(?<fullName>(?<name>[-_\\p{Letter}\\p{Number}\\p{script=Deva}\\p{sc=Thai}]{1,32})(?: (?<subcommandOrGroup>[-_\\p{Letter}\\p{Number}\\p{sc=Deva}\\p{sc=Thai}]{1,32}))?(?: (?<subcommand>[-_\\p{Letter}\\p{Number}\\p{sc=Deva}\\p{sc=Thai}]{1,32}))?):(?<id>\\d{17,20})>",
    "",
  );
  expect(r1 !== null).toBe(true);
});
|
||||
|
||||
it("repeated match and exec calls", () => {
|
||||
for (let i = 0; i < 20000; i++) {
|
||||
let r1 = new OnigurumaRegExp("//.+?/[^?]+", "sg");
|
||||
|
||||
Reference in New Issue
Block a user