regexp-generator: Implement downstream changes

The optimizations from commit e558b29b were never incorporated into the upstream test generator; this commit incorporates them. As far as I can tell, the changes to the Unicode ranges are purely cosmetic: some ranges are now formatted as 6-digit hex numbers instead of 4-digit, and others move the low-surrogate range 0xDC00-0xDFFF to the beginning of the array, but the union of the ranges is still the same.
2025-12-16 02:03:36 +01:00 · 2024-11-01 16:49:48 -07:00 · 2024-11-01 16:49:48 -07:00 · 07ddc3b41b
commit 07ddc3b41b
parent 879326855b
10 changed files with 96 additions and 32 deletions
--- a/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-flags-u.js
+++ b/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-flags-u.js
@ -40,7 +40,7 @@ includes: [regExpUtils.js]
 const str = buildString({
    loneCodePoints: [],
    ranges: [
-        [0x0030, 0x0039],
+        [0x000030, 0x000039],
    ],
 });

--- a/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-plus-quantifier-flags-u.js
+++ b/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-plus-quantifier-flags-u.js
@ -40,7 +40,7 @@ includes: [regExpUtils.js]
 const str = buildString({
    loneCodePoints: [],
    ranges: [
-        [0x0030, 0x0039],
+        [0x000030, 0x000039],
    ],
 });

--- a/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-plus-quantifier.js
+++ b/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-plus-quantifier.js
@ -40,7 +40,7 @@ includes: [regExpUtils.js]
 const str = buildString({
    loneCodePoints: [],
    ranges: [
-        [0x0030, 0x0039],
+        [0x000030, 0x000039],
    ],
 });

--- a/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape.js
+++ b/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape.js
@ -40,7 +40,7 @@ includes: [regExpUtils.js]
 const str = buildString({
    loneCodePoints: [],
    ranges: [
-        [0x0030, 0x0039],
+        [0x000030, 0x000039],
    ],
 });

--- a/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-flags-u.js
+++ b/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-flags-u.js
@ -40,8 +40,10 @@ includes: [regExpUtils.js]
 const str = buildString({
    loneCodePoints: [],
    ranges: [
+        [0x00DC00, 0x00DFFF],
        [0x000000, 0x00002F],
-        [0x00003A, 0x10FFFF],
+        [0x00003A, 0x00DBFF],
+        [0x00E000, 0x10FFFF],
    ],
 });

--- a/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-plus-quantifier-flags-u.js
+++ b/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-plus-quantifier-flags-u.js
@ -38,11 +38,13 @@ includes: [regExpUtils.js]
 ---*/

 const str = buildString({
-  loneCodePoints: [],
-  ranges: [
-      [0x000000, 0x00002F],
-      [0x00003A, 0x10FFFF],
-  ],
+    loneCodePoints: [],
+    ranges: [
+        [0x00DC00, 0x00DFFF],
+        [0x000000, 0x00002F],
+        [0x00003A, 0x00DBFF],
+        [0x00E000, 0x10FFFF],
+    ],
 });

 const re = /\D+/ug;
--- a/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-plus-quantifier.js
+++ b/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-plus-quantifier.js
@ -38,11 +38,13 @@ includes: [regExpUtils.js]
 ---*/

 const str = buildString({
-  loneCodePoints: [],
-  ranges: [
-      [0x000000, 0x00002F],
-      [0x00003A, 0x00FFFF],
-  ],
+    loneCodePoints: [],
+    ranges: [
+        [0x00DC00, 0x00DFFF],
+        [0x000000, 0x00002F],
+        [0x00003A, 0x00DBFF],
+        [0x00E000, 0x00FFFF],
+    ],
 });

 const re = /\D+/g;
--- a/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape.js
+++ b/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape.js
@ -38,11 +38,13 @@ includes: [regExpUtils.js]
 ---*/

 const str = buildString({
-  loneCodePoints: [],
-  ranges: [
-      [0x000000, 0x00002F],
-      [0x00003A, 0x00FFFF],
-  ],
+    loneCodePoints: [],
+    ranges: [
+        [0x00DC00, 0x00DFFF],
+        [0x000000, 0x00002F],
+        [0x00003A, 0x00DBFF],
+        [0x00E000, 0x00FFFF],
+    ],
 });

 const re = /\D/g;
--- a/tools/regexp-generator/index.mjs
+++ b/tools/regexp-generator/index.mjs
@ -1,7 +1,8 @@
 import filenamify from 'filenamify';
 import fs from 'node:fs';
-import jsesc from 'jsesc';
+import regenerate from 'regenerate';
 import rewritePattern from 'regexpu-core';
+import ESCAPE_SETS from 'regexpu-core/data/character-class-escape-sets.js';
 import slugify from 'slugify';

 import header from './header.mjs';
@ -15,30 +16,85 @@ const patterns = {
    'non-digit class escape': '\\D',
 };

+// Pretty-printing code adapted from unicode-property-escapes-tests.
+// https://github.com/mathiasbynens/unicode-property-escapes-tests/blob/60f2dbec2b2a840ee67aa04dbd3449bb90fd2999/regenerate.js
+
+function toHex(codePoint) { // Formats a code point as '0x' plus uppercase hex digits, zero-padded to six.
+    return `0x${ codePoint.toString(16).toUpperCase().padStart(6, '0') }`;
+}
+
+// Splits a regenerate set into lone code points and inclusive [start, end] ranges.
+// `reg.data` is read as a flat list of (start, exclusive end) pairs; the result
+// is `[ loneCodePoints, ranges ]`, each in the order the pairs appear.
+function toTestData(reg) {
+    const { data } = reg;
+    const loneCodePoints = [];
+    const ranges = [];
+    for (let index = 0; index < data.length; index += 2) {
+        const start = data[index];
+        // The stored end is exclusive; subtracting one makes `end` inclusive.
+        const end = data[index + 1] - 1;
+        if (start === end) {
+            loneCodePoints.push(start);
+        } else {
+            ranges.push([start, end]);
+        }
+    }
+    return [ loneCodePoints, ranges ];
+}
+
+// Renders `[ loneCodePoints, ranges ]` as the text of an object literal
+// `{ loneCodePoints: [...], ranges: [...] }`, hex-formatted, 4-space indented.
+function prettyPrint([ loneCodePoints, ranges ]) {
+    const indent = '    ';
+    const separator = `,\n${indent}${indent}`;
+    // Multi-line array literal: one item per line, with a trailing comma.
+    const block = (items) => `[\n${indent}${indent}${ items.join(separator) },\n${indent}]`;
+    const points = loneCodePoints.map((codePoint) => toHex(codePoint));
+    const pairs = ranges.map(([ start, end ]) => `[${ toHex(start) }, ${ toHex(end) }]`);
+    // A single lone code point stays on one line; empty lists print as `[]`.
+    const pointsText = points.length > 1 ? block(points) :
+        points.length === 1 ? `[${points[0]}]` : `[]`;
+    const pairsText = pairs.length ? block(pairs) : `[]`;
+    return `{\n${indent}loneCodePoints: ${ pointsText },\n${indent}ranges: ${ pairsText },\n}`;
+}
+
+// The low-surrogate code points, 0xDC00 through 0xDFFF.
+const LOW_SURROGATES = regenerate().addRange(0xDC00, 0xDFFF);
+
+// Returns the pretty-printed test data for the class escape `escapeChar`, taken
+// from the UNICODE escape set when `flags` contains `u` and from the REGULAR set
+// otherwise. Low surrogates in the set are listed ahead of everything else.
+function buildString(escapeChar, flags) {
+    const setName = flags.includes('u') ? 'UNICODE' : 'REGULAR';
+    const escapeData = ESCAPE_SETS[setName].get(escapeChar);
+    const lowPart = escapeData.clone().intersection(LOW_SURROGATES);
+    if (lowPart.data.length === 0) {
+        return prettyPrint(toTestData(escapeData));
+    }
+    const [ lowPoints, lowRanges ] = toTestData(lowPart);
+    const [ otherPoints, otherRanges ] = toTestData(escapeData.clone().remove(LOW_SURROGATES));
+    return prettyPrint([ [ ...lowPoints, ...otherPoints ], [ ...lowRanges, ...otherRanges ] ]);
+}
+
 function buildContent(desc, pattern, range, max, flags, skip180e) {
+    let string = buildString(pattern[1], flags);
    let method;
    let features = [];

    let content = header(`Compare range for ${desc} ${pattern} with flags ${flags}`);

    content += `
-const str = buildString({ loneCodePoints: [], ranges: [[0, ${
-    jsesc(max, { numbers: 'hexadecimal' })
-}]] });
+const str = buildString(${string});

 const re = /${pattern}/${flags};
-const matchingRange = /${range}/${flags};

 const errors = [];

-function matching(str) {
-    return str.replace(re, '') === str.replace(matchingRange, '');
-}
-
-if (!matching(str)) {
+if (!re.test(str)) {
    // Error, let's find out where
    for (const char of str) {
-        if (!matching(char)) {
+        if (!re.test(char)) {
            errors.push('0x' + char.codePointAt(0).toString(16));
        }
    }
--- a/tools/regexp-generator/package.json
+++ b/tools/regexp-generator/package.json
@ -13,8 +13,8 @@
  "license": "MIT",
  "devDependencies": {
    "filenamify": "^6.0.0",
-    "jsesc": "^3.0.2",
    "mkdirp": "^3.0.1",
+    "regenerate": "^1.4.2",
    "regexpu-core": "^6.1.1",
    "rimraf": "^6.0.1",
    "slugify": "^1.6.6"