// test262/tools/regexp-generator/index.mjs

import filenamify from 'filenamify';
import fs from 'node:fs';
import regenerate from 'regenerate';
import rewritePattern from 'regexpu-core';
import ESCAPE_SETS from 'regexpu-core/data/character-class-escape-sets.js';
import slugify from 'slugify';

import header from './header.mjs';

// Pretty-printing code adapted from unicode-property-escapes-tests.
// https://github.com/mathiasbynens/unicode-property-escapes-tests/blob/60f2dbec2b2a840ee67aa04dbd3449bb90fd2999/regenerate.js

/**
 * Formats a code point as a `0x`-prefixed, upper-case hexadecimal literal
 * that is exactly six digits wide (for example `0x00DC00`).
 *
 * @param {number} codePoint - The code point to format.
 * @returns {string} The hexadecimal literal as source text.
 */
function toHex(codePoint) {
  const digits = codePoint.toString(16).toUpperCase();
  // Left-pad with zeros, then keep only the last six digits.
  return `0x${digits.padStart(6, '0').slice(-6)}`;
}

/**
 * Splits a code point set into lone code points and inclusive ranges.
 *
 * `reg.data` is read as a flat array of `(start, end)` pairs,
 * `[start0, end0, start1, end1, …]`, in which each `end` is exclusive.
 *
 * @param {{ data: number[] }} reg - Object exposing the flat `data` array.
 * @returns {[number[], Array<[number, number]>]} A two-element array: the
 *   code points of single-element pairs, followed by `[start, end]` ranges
 *   whose `end` is inclusive.
 */
function toTestData(reg) {
  const { data } = reg;
  const loneCodePoints = [];
  const ranges = [];
  // Walk the data one `(start, end)` pair at a time.
  for (let index = 0; index < data.length; index += 2) {
    const start = data[index];
    const end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive.
    if (start === end) {
      loneCodePoints.push(start);
    } else {
      ranges.push([start, end]);
    }
  }
  return [loneCodePoints, ranges];
}

/**
 * Renders test data as the source text of an object literal with the
 * properties `loneCodePoints` and `ranges`.
 *
 * Every code point is formatted with `toHex`. Non-empty lists are printed one
 * entry per line; empty lists are printed as `[]`.
 *
 * @param {[number[], Array<[number, number]>]} testData - The lone code
 *   points and the `[start, end]` ranges to print.
 * @returns {string} The object literal as source text.
 */
function prettyPrint([ loneCodePoints, ranges ]) {
  const indent = '  '; // Test 262 uses two-space indents.
  const nestedIndent = indent + indent;

  // Prints already-formatted entries as a multi-line array literal.
  const formatList = (entries) => {
    if (entries.length === 0) {
      return `[]`;
    }
    const separator = `,\n${nestedIndent}`;
    return `[\n${nestedIndent}${entries.join(separator)}\n${indent}]`;
  };

  const loneCodePointsOutput = formatList(
    loneCodePoints.map((codePoint) => toHex(codePoint))
  );
  const rangesOutput = formatList(
    ranges.map(([start, end]) => `[${toHex(start)}, ${toHex(end)}]`)
  );

  return `{\n${indent}loneCodePoints: ${loneCodePointsOutput},\n${indent}ranges: ${rangesOutput}\n}`;
}

// The low (trailing) surrogate code units, U+DC00 through U+DFFF.
const LOW_SURROGATES = regenerate().addRange(0xDC00, 0xDFFF);

/**
 * Renders a code point set as pretty-printed test data source text.
 *
 * When the set contains code points from `LOW_SURROGATES`, those are listed
 * ahead of all other lone code points and ranges.
 * NOTE(review): presumably this keeps a low surrogate from directly following
 * a high surrogate (and pairing up with it) in the string the generated test
 * builds — confirm against the harness's `buildString`.
 *
 * @param {object} escapeData - A `regenerate` set of code points.
 * @returns {string} The output of `prettyPrint` for the reordered data.
 */
function toTestCode(escapeData) {
  const lowSurrogates = escapeData.clone().intersection(LOW_SURROGATES);
  const hasLowSurrogates = lowSurrogates.data.length !== 0;
  if (!hasLowSurrogates) {
    return prettyPrint(toTestData(escapeData));
  }

  const withoutLowSurrogates = escapeData.clone().remove(LOW_SURROGATES);
  const [ lowLoneCodePoints, lowRanges ] = toTestData(lowSurrogates);
  const [ otherLoneCodePoints, otherRanges ] = toTestData(withoutLowSurrogates);
  return prettyPrint([
    [ ...lowLoneCodePoints, ...otherLoneCodePoints ],
    [ ...lowRanges, ...otherRanges ],
  ]);
}

// The character class escapes to generate tests for. Each key is the escape
// letter; each value is the name used in test descriptions and file names.
const patterns = {
  s: 'whitespace class escape',
  S: 'non-whitespace class escape',
  w: 'word class escape',
  W: 'non-word class escape',
  d: 'digit class escape',
  D: 'non-digit class escape',
};

// Maps every escape letter to the letter of its complement class.
const negation = {
  s: 'S',
  S: 's',
  w: 'W',
  W: 'w',
  d: 'D',
  D: 'd',
};

// Flag configurations tested in every generated file. Each key becomes the
// name of a regex variable in the generated test; each value is its flags.
const flags_configs = {
  standard: '',
  unicode: 'u',
  vflag: 'v',
};

// For each character class escape, one positive and one negative test file
// is generated; `suffix` is appended to the output file name.
const test_cases = [
  { positivity: true, suffix: '-positive-cases' },
  { positivity: false, suffix: '-negative-cases' },
];

/**
 * Builds the regular expression source for one test case.
 *
 * @param {string} pattern - The escape letter, e.g. `s` for `\s`.
 * @param {boolean} positivity - When truthy, the escape is anchored and
 *   repeated (`^\s+$`); otherwise the bare escape (`\s`) is returned.
 * @returns {string} The regular expression source text.
 */
function buildRegex(pattern, positivity) {
  const escape = `\\${pattern}`;
  if (positivity) {
    return `^${escape}+$`;
  }
  return escape;
}

/**
 * Builds the source text that declares one regex per entry of
 * `flags_configs`, followed by a `regexes` array listing all of them.
 *
 * @param {string} pattern - The escape letter, e.g. `s` for `\s`.
 * @param {boolean} positivity - Forwarded to `buildRegex`.
 * @returns {string} The declarations, one per line, ending with the
 *   `const regexes = […];` line (no trailing newline).
 */
function buildRegexes(pattern, positivity) {
  const source = buildRegex(pattern, positivity);
  const declarations = Object.entries(flags_configs).map(
    ([name, flags]) => `const ${name} = /${source}/${flags};\n`
  );
  const names = Object.keys(flags_configs).join(',');
  return `${declarations.join('')}const regexes = [${names}];`;
}

/**
 * Builds the test data source text for one test case.
 *
 * Positive cases use the code points of the escape itself; negative cases
 * use the code points of its complement, looked up through `negation`.
 *
 * @param {string} pattern - The escape letter, e.g. `s` for `\s`.
 * @param {boolean} positivity - Whether to build the positive case.
 * @returns {string} The output of `toTestCode` for the selected set from
 *   `ESCAPE_SETS.UNICODE`.
 */
function buildString(pattern, positivity) {
  let escape = pattern;
  if (!positivity) {
    escape = negation[pattern];
  }
  return toTestCode(ESCAPE_SETS.UNICODE.get(escape));
}

/**
 * Builds the one-line description of a test case.
 *
 * @param {string} pattern - The escape letter; also the key into `patterns`.
 * @param {boolean} positivity - Selects "positive" or "negative" wording.
 * @returns {string} For example
 *   `Check positive cases of whitespace class escape \s.`
 */
function buildDescr(pattern, positivity) {
  const polarity = positivity ? 'positive' : 'negative';
  return `Check ${polarity} cases of ${patterns[pattern]} \\${pattern}.`;
}

/**
 * Builds the full source text of one generated test file.
 *
 * The file starts with the output of `header` for the test description,
 * followed by a script that builds the test string, declares the regexes and
 * asserts that no code point violates the expectation:
 * - positive case: every regex must match the whole string, so non-matching
 *   code points are reported;
 * - negative case: no regex may match, so matching code points are reported.
 *
 * The emitted script itself calls `buildString` and `assert.sameValue`; it
 * does not define them.
 *
 * @param {string} pattern - The escape letter, e.g. `s` for `\s`.
 * @param {boolean} positivity - Whether to generate the positive case.
 * @returns {string} The source text of the generated test file.
 */
function buildContent(pattern, positivity) {
  const regexDeclarations = buildRegexes(pattern, positivity);
  const testData = buildString(pattern, positivity);
  const description = buildDescr(pattern, positivity);

  // Positive cases flag code points that do NOT match, hence the `!`.
  const [ negate, failurePrefix ] = positivity
    ? [ '!', 'Expected full match, but did not match: ' ]
    : [ '', 'Expected no match, but matched: ' ];

  const body = `
const str = buildString(
${testData}
);

${regexDeclarations}

const errors = [];

for (const regex of regexes) {
  if (${negate}regex.test(str)) {
    // Error, let's find out where
    for (const char of str) {
      if (${negate}regex.test(char)) {
        errors.push('0x' + char.codePointAt(0).toString(16));
      }
    }
  }
}

assert.sameValue(
  errors.length,
  0,
  '${failurePrefix}' + errors.join(',')
);
`;

  return header(description) + body;
}

/**
 * Writes one generated test file, synchronously.
 *
 * The file name is `character-class-<slug><suffix>.js`, where `<slug>` is the
 * lower-cased description passed through `filenamify` and then `slugify`.
 * The output directory is a relative path, so it is resolved against the
 * current working directory of the process.
 *
 * @param {string} desc - Description the file name is derived from.
 * @param {string} content - The file's contents.
 * @param {string} [suffix=''] - Text appended to the slug in the file name.
 */
function writeFile(desc, content, suffix = '') {
  const outPath = '../../test/built-ins/RegExp/CharacterClassEscapes';
  const slug = slugify(filenamify(desc.toLowerCase()));
  fs.writeFileSync(`${outPath}/character-class-${slug}${suffix}.js`, content);
}

// Generate one file per character class escape and test case (positive and
// negative).
for (const [pattern, desc] of Object.entries(patterns)) {
  for (const { positivity, suffix } of test_cases) {
    writeFile(desc, buildContent(pattern, positivity), suffix);
  }
}