regexp-generator: Implement downstream changes

The optimizations from commit e558b29b were never incorporated into the
upstream test generator. This commit incorporates them.

As far as I can tell, the changes to the Unicode ranges are purely
cosmetic. Some are formatted as 6-digit hex numbers instead of 4-digit.
Others move the low-surrogates range 0xDC00-0xDFFF to the beginning of the
array, but the union of the ranges is still the same.
This commit is contained in:
Philip Chimento 2024-11-01 16:49:48 -07:00 committed by Philip Chimento
parent 879326855b
commit 07ddc3b41b
10 changed files with 96 additions and 32 deletions

View File

@ -40,7 +40,7 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x0030, 0x0039],
[0x000030, 0x000039],
],
});

View File

@ -40,7 +40,7 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x0030, 0x0039],
[0x000030, 0x000039],
],
});

View File

@ -40,7 +40,7 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x0030, 0x0039],
[0x000030, 0x000039],
],
});

View File

@ -40,7 +40,7 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x0030, 0x0039],
[0x000030, 0x000039],
],
});

View File

@ -40,8 +40,10 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x00DC00, 0x00DFFF],
[0x000000, 0x00002F],
[0x00003A, 0x10FFFF],
[0x00003A, 0x00DBFF],
[0x00E000, 0x10FFFF],
],
});

View File

@ -38,11 +38,13 @@ includes: [regExpUtils.js]
---*/
const str = buildString({
loneCodePoints: [],
ranges: [
[0x000000, 0x00002F],
[0x00003A, 0x10FFFF],
],
loneCodePoints: [],
ranges: [
[0x00DC00, 0x00DFFF],
[0x000000, 0x00002F],
[0x00003A, 0x00DBFF],
[0x00E000, 0x10FFFF],
],
});
const re = /\D+/ug;

View File

@ -38,11 +38,13 @@ includes: [regExpUtils.js]
---*/
const str = buildString({
loneCodePoints: [],
ranges: [
[0x000000, 0x00002F],
[0x00003A, 0x00FFFF],
],
loneCodePoints: [],
ranges: [
[0x00DC00, 0x00DFFF],
[0x000000, 0x00002F],
[0x00003A, 0x00DBFF],
[0x00E000, 0x00FFFF],
],
});
const re = /\D+/g;

View File

@ -38,11 +38,13 @@ includes: [regExpUtils.js]
---*/
const str = buildString({
loneCodePoints: [],
ranges: [
[0x000000, 0x00002F],
[0x00003A, 0x00FFFF],
],
loneCodePoints: [],
ranges: [
[0x00DC00, 0x00DFFF],
[0x000000, 0x00002F],
[0x00003A, 0x00DBFF],
[0x00E000, 0x00FFFF],
],
});
const re = /\D/g;

View File

@ -1,7 +1,8 @@
import filenamify from 'filenamify';
import fs from 'node:fs';
import jsesc from 'jsesc';
import regenerate from 'regenerate';
import rewritePattern from 'regexpu-core';
import ESCAPE_SETS from 'regexpu-core/data/character-class-escape-sets.js';
import slugify from 'slugify';
import header from './header.mjs';
@ -15,30 +16,85 @@ const patterns = {
'non-digit class escape': '\\D',
};
// Pretty-printing code adapted from unicode-property-escapes-tests.
// https://github.com/mathiasbynens/unicode-property-escapes-tests/blob/60f2dbec2b2a840ee67aa04dbd3449bb90fd2999/regenerate.js
/**
 * Format a number as a `0x`-prefixed, uppercase hexadecimal literal that is
 * zero-padded to at least six digits (for example `0x30` -> `'0x000030'`).
 *
 * @param {number} codePoint - Non-negative integer to format.
 * @returns {string} The hexadecimal literal text.
 */
function toHex(codePoint) {
  // padStart only ever adds digits; slicing a fixed-width suffix would
  // silently drop the leading digits of any value wider than six hex digits.
  return `0x${codePoint.toString(16).toUpperCase().padStart(6, '0')}`;
}
/**
 * Split a set's flat `data` array into lone code points and inclusive ranges.
 *
 * `reg.data` is read two entries at a time as `(start, endExclusive)` pairs.
 * A pair that covers exactly one value is reported as a lone code point;
 * every other pair is reported as an inclusive `[start, end]` range.
 *
 * @param {{ data: number[] }} reg - Object exposing the flat pair array
 *   (callers in this file pass `regenerate` sets).
 * @returns {[number[], number[][]]} `[loneCodePoints, ranges]`, each in the
 *   order the pairs appear in `reg.data`.
 */
function toTestData(reg) {
  const { data } = reg;
  const loneCodePoints = [];
  const ranges = [];
  // Step over the data one `(start, end)` pair at a time.
  for (let index = 0; index < data.length; index += 2) {
    const start = data[index];
    const end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive.
    if (start === end) {
      loneCodePoints.push(start);
    } else {
      ranges.push([start, end]);
    }
  }
  return [loneCodePoints, ranges];
}
/**
 * Render lone code points and ranges as the source text of an object literal
 * with `loneCodePoints` and `ranges` properties, every number formatted
 * through `toHex`.
 *
 * An empty list is printed as `[]`, a single lone code point is printed
 * inline, and anything longer is printed one item per line with a trailing
 * comma.
 *
 * @param {[number[], number[][]]} testData - `[loneCodePoints, ranges]`.
 * @returns {string} Source text for the object literal.
 */
function prettyPrint([ loneCodePoints, ranges ]) {
  const indent = ' ';
  const itemSeparator = `,\n${indent}${indent}`;
  // Multi-line array literal: one item per line, closed by a trailing comma.
  const asBlock = (items) =>
    `[\n${indent}${indent}${ items.join(itemSeparator) },\n${indent}]`;

  const hexPoints = loneCodePoints.map((codePoint) => toHex(codePoint));
  const hexRanges = ranges.map(
    ([ low, high ]) => `[${ toHex(low) }, ${ toHex(high) }]`
  );

  let loneCodePointsOutput;
  if (hexPoints.length === 0) {
    loneCodePointsOutput = `[]`;
  } else if (hexPoints.length === 1) {
    loneCodePointsOutput = `[${hexPoints[0]}]`;
  } else {
    loneCodePointsOutput = asBlock(hexPoints);
  }

  const rangesOutput = hexRanges.length === 0 ? `[]` : asBlock(hexRanges);

  return [
    '{',
    `${indent}loneCodePoints: ${ loneCodePointsOutput },`,
    `${indent}ranges: ${ rangesOutput },`,
    '}',
  ].join('\n');
}
// Every low-surrogate code unit; these are listed first in generated data.
const LOW_SURROGATES = regenerate().addRange(0xDC00, 0xDFFF);

/**
 * Build the pretty-printed test-data literal for a character class escape.
 *
 * The set for `escapeChar` is looked up in `ESCAPE_SETS` (the `UNICODE` map
 * when `flags` contains `u`, otherwise the `REGULAR` map). If the set
 * contains any low surrogates, they are split out and listed ahead of the
 * remaining code points and ranges; otherwise the set is printed as-is.
 *
 * @param {string} escapeChar - Key to look up in the escape-set map.
 * @param {string} flags - Regular expression flags.
 * @returns {string} Output of `prettyPrint` for the selected set.
 */
function buildString(escapeChar, flags) {
  const setName = flags.includes('u') ? 'UNICODE' : 'REGULAR';
  const escapeSet = ESCAPE_SETS[setName].get(escapeChar);

  const lowPart = escapeSet.clone().intersection(LOW_SURROGATES);
  if (lowPart.data.length === 0) {
    return prettyPrint(toTestData(escapeSet));
  }

  const remainder = escapeSet.clone().remove(LOW_SURROGATES);
  const [ lowSingles, lowSpans ] = toTestData(lowPart);
  const [ otherSingles, otherSpans ] = toTestData(remainder);

  // Low-surrogate entries go in front of everything else.
  return prettyPrint([
    [ ...lowSingles, ...otherSingles ],
    [ ...lowSpans, ...otherSpans ],
  ]);
}
function buildContent(desc, pattern, range, max, flags, skip180e) {
let string = buildString(pattern[1], flags);
let method;
let features = [];
let content = header(`Compare range for ${desc} ${pattern} with flags ${flags}`);
content += `
const str = buildString({ loneCodePoints: [], ranges: [[0, ${
jsesc(max, { numbers: 'hexadecimal' })
}]] });
const str = buildString(${string});
const re = /${pattern}/${flags};
const matchingRange = /${range}/${flags};
const errors = [];
function matching(str) {
return str.replace(re, '') === str.replace(matchingRange, '');
}
if (!matching(str)) {
if (!re.test(str)) {
// Error, let's find out where
for (const char of str) {
if (!matching(char)) {
if (!re.test(char)) {
errors.push('0x' + char.codePointAt(0).toString(16));
}
}

View File

@ -13,8 +13,8 @@
"license": "MIT",
"devDependencies": {
"filenamify": "^6.0.0",
"jsesc": "^3.0.2",
"mkdirp": "^3.0.1",
"regenerate": "^1.4.2",
"regexpu-core": "^6.1.1",
"rimraf": "^6.0.1",
"slugify": "^1.6.6"