Aurèle 05fbae4993 Fix Regex Character Class Escape Tests
For each character class escape (\d, \D, \s, \S, \w, \W), check
positive cases (the escape matches all characters it's supposed to
match) and negative cases (the escape doesn't match any of the
characters it should not match).  Each of these checks is also done in
Unicode mode and with the v flag.

This uses regenerate.js from the unicode-property-escapes-tests
repo to generate strings that contain exactly the characters that
are supposed to be matched or not matched for each escape.

Comparison is done with regex test instead of regex replace to
optimize the tests.

This is part of my work at the SYSTEMF lab at EPFL.

Avoid modifying the regenerate library object prototype.
2025-03-13 11:58:44 -07:00

181 lines
5.1 KiB
JavaScript

import filenamify from 'filenamify';
import fs from 'node:fs';
import regenerate from 'regenerate';
import rewritePattern from 'regexpu-core';
import ESCAPE_SETS from 'regexpu-core/data/character-class-escape-sets.js';
import slugify from 'slugify';
import header from './header.mjs';
// Pretty-printing code adapted from unicode-property-escapes-tests.
// https://github.com/mathiasbynens/unicode-property-escapes-tests/blob/60f2dbec2b2a840ee67aa04dbd3449bb90fd2999/regenerate.js
/**
 * Format a code point as a `0x`-prefixed, upper-case hexadecimal literal
 * that is zero-padded on the left to six digits.
 *
 * @param {number} codePoint - The code point to format.
 * @returns {string} The formatted literal, e.g. `0x000041` for `0x41`.
 */
function toHex(codePoint) {
  const digits = codePoint.toString(16).toUpperCase();
  // Pad to six digits and keep only the last six characters.
  return `0x${digits.padStart(6, '0').slice(-6)}`;
}
/**
 * Split a set's flat `data` array into lone code points and ranges.
 *
 * `reg.data` is read two entries at a time as `(start, end)` pairs whose
 * `end` is exclusive; it is decremented here so that the returned ranges are
 * inclusive on both sides.
 *
 * @param {{ data: number[] }} reg - Object exposing the flat `data` array.
 * @returns {[number[], number[][]]} `[loneCodePoints, ranges]`, where each
 *   range is an inclusive `[start, end]` pair.
 */
function toTestData(reg) {
  const { data } = reg;
  const loneCodePoints = [];
  const ranges = [];
  // Walk the data one `(start, end)` pair at a time.
  for (let offset = 0; offset < data.length; offset += 2) {
    const first = data[offset];
    const last = data[offset + 1] - 1; // Make the upper bound inclusive.
    if (first === last) {
      loneCodePoints.push(first);
    } else {
      ranges.push([first, last]);
    }
  }
  return [loneCodePoints, ranges];
}
/**
 * Render lone code points and ranges as the source text of an object literal
 * with `loneCodePoints` and `ranges` array properties.
 *
 * Every code point is printed through `toHex`. Non-empty lists are printed
 * one item per line; empty lists are printed as `[]`.
 *
 * @param {[number[], number[][]]} testData - `[loneCodePoints, ranges]`,
 *   where each range is a `[start, end]` pair.
 * @returns {string} The object-literal source text.
 */
function prettyPrint([ loneCodePoints, ranges ]) {
  const indent = '  '; // Test 262 uses two-space indents.
  const itemIndent = `${indent}${indent}`;
  // Shared renderer for both lists so their layout cannot drift apart.
  const printList = (items) => (items.length > 0 ?
    `[\n${itemIndent}${ items.join(`,\n${itemIndent}`) }\n${indent}]` :
    `[]`);
  const hexCodePoints = loneCodePoints.map((codePoint) => toHex(codePoint));
  const hexRanges = ranges.map(
    ([start, end]) => `[${ toHex(start) }, ${ toHex(end) }]`
  );
  return `{\n${indent}loneCodePoints: ${ printList(hexCodePoints) },\n${indent}ranges: ${ printList(hexRanges) }\n}`;
}
// Set covering the low surrogates, U+DC00 through U+DFFF. `toTestCode` uses
// it to split these code points out of an escape's set and list them first.
const LOW_SURROGATES = regenerate().addRange(0xDC00, 0xDFFF);
/**
 * Turn the set of code points for an escape into pretty-printed test data.
 *
 * When the set contains low surrogates, they are separated from the rest and
 * listed first, both among the lone code points and among the ranges.
 * Otherwise the set is printed as is.
 *
 * @param {object} escapeData - Set exposing `clone()` and `data`; it is not
 *   modified, since every set operation runs on a clone.
 * @returns {string} Object-literal source text produced by `prettyPrint`.
 */
function toTestCode(escapeData) {
  const lowPart = escapeData.clone().intersection(LOW_SURROGATES);
  if (lowPart.data.length > 0) {
    const remainder = escapeData.clone().remove(LOW_SURROGATES);
    const [ lowLone, lowRanges ] = toTestData(lowPart);
    const [ otherLone, otherRanges ] = toTestData(remainder);
    // Low surrogates go in front of everything else.
    return prettyPrint([
      [...lowLone, ...otherLone],
      [...lowRanges, ...otherRanges],
    ]);
  }
  return prettyPrint(toTestData(escapeData));
}
// The character class escapes under test, keyed by escape letter. Each value
// is the human-readable name used in test descriptions and output file names.
const patterns = {
  s: 'whitespace class escape',
  S: 'non-whitespace class escape',
  w: 'word class escape',
  W: 'non-word class escape',
  d: 'digit class escape',
  D: 'non-digit class escape',
};
// Maps each escape letter to the letter of its complementary escape, in both
// directions (lower case to upper case and back).
const negation = Object.fromEntries(
  [
    ['s', 'S'],
    ['w', 'W'],
    ['d', 'D'],
  ].flatMap(([lower, upper]) => [
    [lower, upper],
    [upper, lower],
  ])
);
// Flag configurations exercised in every generated test file. Each key is the
// name of the regex constant declared in the generated code; each value is
// the flag string appended to that regex literal.
const flags_configs = {
  standard: '',
  unicode: 'u',
  vflag: 'v',
};
// Every character class escape gets a positive and a negative test file;
// `suffix` is appended to the generated file's name.
const test_cases = [true, false].map((positivity) => ({
  positivity,
  suffix: positivity ? '-positive-cases' : '-negative-cases',
}));
/**
 * Build the regex source for one escape.
 *
 * @param {string} pattern - Escape letter, e.g. `d`.
 * @param {boolean} positivity - Truthy for the positive test case.
 * @returns {string} `^\X+$` for positive cases (the whole string must consist
 *   of matches), or the bare `\X` for negative cases.
 */
function buildRegex(pattern, positivity) {
  const escape = `\\${pattern}`;
  if (positivity) {
    return `^${escape}+$`;
  }
  return escape;
}
/**
 * Build the source text that declares one regex constant per entry of
 * `flags_configs`, followed by a `regexes` array listing all of them.
 *
 * @param {string} pattern - Escape letter, e.g. `d`.
 * @param {boolean} positivity - Truthy for the positive test case.
 * @returns {string} The declarations, one per line, with no trailing newline.
 */
function buildRegexes(pattern, positivity) {
  const source = buildRegex(pattern, positivity);
  const declarations = Object.entries(flags_configs).map(
    ([name, flags]) => `const ${name} = /${source}/${flags};\n`
  );
  const names = Object.keys(flags_configs).join(',');
  return `${declarations.join('')}const regexes = [${names}];`;
}
/**
 * Build the pretty-printed test data for one escape and polarity.
 *
 * Positive cases use the set of the escape itself; negative cases use the set
 * of its complementary escape, looked up through `negation`.
 *
 * @param {string} pattern - Escape letter, e.g. `d`.
 * @param {boolean} positivity - Truthy for the positive test case.
 * @returns {string} Object-literal source text produced by `toTestCode`.
 */
function buildString(pattern, positivity) {
  let escape = pattern;
  if (!positivity) {
    escape = negation[pattern];
  }
  return toTestCode(ESCAPE_SETS.UNICODE.get(escape));
}
/**
 * Build the one-line description of a generated test.
 *
 * @param {string} pattern - Escape letter; must be a key of `patterns`.
 * @param {boolean} positivity - Truthy for the positive test case.
 * @returns {string} E.g. `Check positive cases of digit class escape \d.`
 */
function buildDescr(pattern, positivity) {
  let intro = 'Check negative cases of';
  if (positivity) {
    intro = 'Check positive cases of';
  }
  return [intro, patterns[pattern], `\\${pattern}.`].join(' ');
}
/**
 * Build the full text of one generated test file.
 *
 * The output is the result of `header(description)` followed by generated
 * code that: builds `str` from the test data, declares the regexes, runs
 * every regex against the whole string, and, only when that check fails,
 * re-tests code point by code point to collect the offenders. It ends with
 * an `assert.sameValue` expecting zero errors.
 *
 * @param {string} pattern - Escape letter, e.g. `d`.
 * @param {boolean} positivity - Truthy for the positive test case. Positive
 *   tests fail when a regex does NOT match; negative tests fail when it does.
 * @returns {string} The test file content.
 */
function buildContent(pattern, positivity) {
  const regexDeclarations = buildRegexes(pattern, positivity);
  const testData = buildString(pattern, positivity);
  const description = buildDescr(pattern, positivity);

  // Positive tests negate `regex.test(...)` so that a missing match is the error.
  let negate = '';
  let failureMessage = 'Expected no match, but matched: ';
  if (positivity) {
    negate = '!';
    failureMessage = 'Expected full match, but did not match: ';
  }

  const preamble = header(description);
  const body = `
const str = buildString(
${testData}
);
${regexDeclarations}
const errors = [];
for (const regex of regexes) {
if (${negate}regex.test(str)) {
// Error, let's find out where
for (const char of str) {
if (${negate}regex.test(char)) {
errors.push('0x' + char.codePointAt(0).toString(16));
}
}
}
}
assert.sameValue(
errors.length,
0,
'${failureMessage}' + errors.join(',')
);
`;
  return preamble + body;
}
/**
 * Write one generated test file into the CharacterClassEscapes test
 * directory.
 *
 * The file name is `character-class-<slug><suffix>.js`, where `<slug>` is the
 * lower-cased description passed through `filenamify` and then `slugify`.
 * The output directory is a relative path.
 *
 * @param {string} desc - Human-readable name of the escape.
 * @param {string} content - Text to write.
 * @param {string} [suffix=''] - Text appended after the slug.
 */
function writeFile(desc, content, suffix = '') {
  const outPath = '../../test/built-ins/RegExp/CharacterClassEscapes';
  const slug = slugify(filenamify(desc.toLowerCase()));
  fs.writeFileSync(`${outPath}/character-class-${slug}${suffix}.js`, content);
}
// Generate one test file per character class escape and per polarity.
for (const [pattern, desc] of Object.entries(patterns)) {
  for (const { positivity, suffix } of test_cases) {
    writeFile(desc, buildContent(pattern, positivity), suffix);
  }
}