Update tests to parse language tags as Unicode BCP 47 Locale Identifiers

harness/testIntl.js
- Add now-invalid tags to getInvalidLanguageTags; these tags were previously used in test files changed in this commit.
- Update isCanonicalizedStructurallyValidLanguageTag regular expressions.

test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js
- Moved five now-invalid tags to getInvalidLanguageTags function in testIntl.js

test/intl402/Intl/getCanonicalLocales/preferred-grandfathered.js
- All irregular grandfathered tags are invalid now
- Regular grandfathered with extlang subtags are now also invalid
- Regular grandfathered with variant-like subtags are still valid

test/intl402/Intl/getCanonicalLocales/weird-cases.js
- Revert changes from last commit
- "x-u-foo" is now invalid and was moved to getInvalidLanguageTags function

test/intl402/ListFormat/constructor/constructor/locales-valid.js
test/intl402/RelativeTimeFormat/constructor/constructor/locales-valid.js
test/intl402/Segmenter/constructor/constructor/locales-valid.js
- Irregular grandfathered and privateuse-only tags are no longer valid language tags

test/intl402/language-tags-canonicalized.js
- Same changes as in test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js

test/intl402/language-tags-invalid.js
- Invalid tags list in this file was a subset of getInvalidLanguageTags, so replaced with getInvalidLanguageTags to get more coverage

test/intl402/language-tags-valid.js
- Same changes as in test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js
2025-07-31 01:44:54 +02:00 · 2019-03-11 10:33:40 -07:00 · 2019-03-11 10:33:40 -07:00 · f7e8dba39b
commit f7e8dba39b
parent 94053978bc
10 changed files with 129 additions and 102 deletions
--- a/harness/testIntl.js
+++ b/harness/testIntl.js
@ -189,6 +189,19 @@ function getInvalidLanguageTags() {
    "de-1996-1996", // duplicate numeric variant
    "pt-u-ca-gregory-u-nu-latn", // duplicate singleton subtag

+    // Invalid tags starting with: https://github.com/tc39/ecma402/pull/289
+    "no-nyn", // regular grandfathered in BCP47, but invalid in UTS35
+    "i-klingon", // irregular grandfathered in BCP47, but invalid in UTS35
+    "zh-hak-CN", // language with extlang in BCP47, but invalid in UTS35
+    "sgn-ils", // language with extlang in BCP47, but invalid in UTS35
+    "x-foo", // privateuse-only in BCP47, but invalid in UTS35
+    "x-en-US-12345", // more privateuse-only variants.
+    "x-12345-12345-en-US",
+    "x-en-US-12345-12345",
+    "x-en-u-foo",
+    "x-en-u-foo-u-bar",
+    "x-u-foo",
+
    // underscores in different parts of the language tag
    "de_DE",
    "DE_de",
@ -238,27 +251,32 @@ function getInvalidLanguageTags() {
 function isCanonicalizedStructurallyValidLanguageTag(locale) {

  /**
-   * Regular expression defining BCP 47 language tags.
+   * Regular expression defining Unicode BCP 47 Locale Identifiers.
   *
-   * Spec: RFC 5646 section 2.1.
+   * Spec: https://unicode.org/reports/tr35/#Unicode_locale_identifier
   */
-  var alpha = "[a-zA-Z]",
+  var alpha = "[a-z]",
    digit = "[0-9]",
    alphanum = "(" + alpha + "|" + digit + ")",
-    regular = "(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)",
-    irregular = "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)",
-    grandfathered = "(" + irregular + "|" + regular + ")",
-    privateuse = "(x(-[a-z0-9]{1,8})+)",
-    singleton = "(" + digit + "|[A-WY-Za-wy-z])",
-    extension = "(" + singleton + "(-" + alphanum + "{2,8})+)",
    variant = "(" + alphanum + "{5,8}|(" + digit + alphanum + "{3}))",
    region = "(" + alpha + "{2}|" + digit + "{3})",
    script = "(" + alpha + "{4})",
-    extlang = "(" + alpha + "{3}(-" + alpha + "{3}){0,2})",
-    language = "(" + alpha + "{2,3}(-" + extlang + ")?|" + alpha + "{4}|" + alpha + "{5,8})",
-    langtag = language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*(-" + extension + ")*(-" + privateuse + ")?",
-    languageTag = "^(" + langtag + "|" + privateuse + "|" + grandfathered + ")$",
+    language = "(" + alpha + "{2,3}|" + alpha + "{5,8})",
+    privateuse = "(x(-[a-z0-9]{1,8})+)",
+    singleton = "(" + digit + "|[a-wy-z])",
+    attribute= "(" + alphanum + "{3,8})",
+    keyword = "(" + alphanum + alpha + "(-" + alphanum + "{3,8})*)",
+    unicode_locale_extensions = "(u((-" + keyword + ")+|((-" + attribute + ")+(-" + keyword + ")*)))",
+    tlang = "(" + language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*)",
+    tfield = "(" + alpha + digit + "(-" + alphanum + "{3,8})+)",
+    transformed_extensions = "(t((-" + tlang + "(-" + tfield + ")*)|(-" + tfield + ")+))",
+    other_singleton = "(" + digit + "|[a-sv-wy-z])",
+    other_extensions = "(" + other_singleton + "(-" + alphanum + "{2,8})+)",
+    extension = "(" + unicode_locale_extensions + "|" + transformed_extensions + "|" + other_extensions + ")",
+    locale_id = language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*(-" + extension + ")*(-" + privateuse + ")?",
+    languageTag = "^(" + locale_id + ")$",
    languageTagRE = new RegExp(languageTag, "i");
+
  var duplicateSingleton = "-" + singleton + "-(.*-)?\\1(?!" + alphanum + ")",
    duplicateSingletonRE = new RegExp(duplicateSingleton, "i"),
    duplicateVariant = "(" + alphanum + "{2,8}-)+" + variant + "-(" + alphanum + "{2,8}-)*\\3(?!" + alphanum + ")",
@ -266,7 +284,7 @@ function isCanonicalizedStructurallyValidLanguageTag(locale) {


  /**
-   * Verifies that the given string is a well-formed BCP 47 language tag
+   * Verifies that the given string is a well-formed Unicode BCP 47 Locale Identifier
   * with no duplicate variant or singleton subtags.
   *
   * Spec: ECMAScript Internationalization API Specification, draft, 6.2.2.
--- a/test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js
+++ b/test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js
@ -37,15 +37,10 @@ var canonicalizedTags = {
  "es-419-u-nu-latn": "es-419-u-nu-latn",
  "cmn-hans-cn-u-ca-t-ca-x-t-u": "zh-Hans-CN-t-ca-u-ca-x-t-u",
  "de-gregory-u-ca-gregory": "de-gregory-u-ca-gregory",
-  "no-nyn": "nn",
-  "i-klingon": "tlh",
  "sgn-GR": "gss",
  "ji": "yi",
  "de-DD": "de-DE",
-  "zh-hak-CN": "hak-CN",
-  "sgn-ils": "ils",
  "in": "id",
-  "x-foo": "und-x-foo",
  "sr-cyrl-ekavsk": "sr-Cyrl-ekavsk",
  "en-ca-newfound": "en-CA-newfound",
  "sl-rozaj-biske-1994": "sl-rozaj-biske-1994",
--- a/test/intl402/Intl/getCanonicalLocales/preferred-grandfathered.js
+++ b/test/intl402/Intl/getCanonicalLocales/preferred-grandfathered.js
@ -20,67 +20,77 @@ info: |
      ...

  6.2.3 CanonicalizeLanguageTag ( locale )
-    The CanonicalizeLanguageTag abstract operation returns the canonical and case-regularized
-    form of the locale argument (which must be a String value that is a structurally valid
-    BCP 47 language tag as verified by the IsStructurallyValidLanguageTag abstract operation).
-    A conforming implementation shall take the steps specified in RFC 5646 section 4.5, or
-    successor, to bring the language tag into canonical form, and to regularize the case of
-    the subtags. Furthermore, a conforming implementation shall not take the steps to bring
-    a language tag into "extlang form", nor shall it reorder variant subtags.
-
-    The specifications for extensions to BCP 47 language tags, such as RFC 6067, may include
-    canonicalization rules for the extension subtag sequences they define that go beyond the
-    canonicalization rules of RFC 5646 section 4.5. Implementations are allowed, but not
-    required, to apply these additional rules.
+    The CanonicalizeLanguageTag abstract operation returns the canonical and case-regularized form
+    of the locale argument (which must be a String value that is a structurally valid Unicode
+    BCP 47 Locale Identifier as verified by the IsStructurallyValidLanguageTag abstract operation).
+    A conforming implementation shall take the steps specified in the “BCP 47 Language Tag to
+    Unicode BCP 47 Locale Identifier” algorithm, from Unicode Technical Standard #35 LDML
+    § 3.3.1 BCP 47 Language Tag Conversion.

 includes: [testIntl.js]
 ---*/

 // Generated from http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
 // File-Date: 2017-08-15
-var canonicalizedTags = {
-  // Irregular tags.
-  "en-gb-oed": "en-GB-oxendict",
-  "i-ami": "ami",
-  "i-bnn": "bnn",
-  "i-default": "und-x-i-default",
-  "i-enochian": "und-x-i-enochian",
-  "i-hak": "hak",
-  "i-klingon": "tlh",
-  "i-lux": "lb",
-  "i-mingo": "und-x-i-mingo",
-  "i-navajo": "nv",
-  "i-pwn": "pwn",
-  "i-tao": "tao",
-  "i-tay": "tay",
-  "i-tsu": "tsu",
-  "sgn-be-fr": "sfb",
-  "sgn-be-nl": "vgt",
-  "sgn-ch-de": "sgg",

-  // Regular tags.
+var irregularGrandfathered = [
+  "en-gb-oed",
+  "i-ami",
+  "i-bnn",
+  "i-default",
+  "i-enochian",
+  "i-hak",
+  "i-klingon",
+  "i-lux",
+  "i-mingo",
+  "i-navajo",
+  "i-pwn",
+  "i-tao",
+  "i-tay",
+  "i-tsu",
+  "sgn-be-fr",
+  "sgn-be-nl",
+  "sgn-ch-de",
+];
+
+var regularGrandfatheredNonUTS35 = [
+  "no-bok",
+  "no-nyn",
+  "zh-min",
+  "zh-min-nan",
+];
+
+var regularGrandfatheredUTS35 = {
  "art-lojban": "jbo",
  "cel-gaulish": "und-x-cel-gaulish",
-  "no-bok": "nb",
-  "no-nyn": "nn",
  "zh-guoyu": "zh",
  "zh-hakka": "hak",
-  "zh-min": "und-x-zh-min",
-  "zh-min-nan": "nan",
  "zh-xiang": "hsn",
 };

 // make sure the data above is correct
-Object.getOwnPropertyNames(canonicalizedTags).forEach(function (tag) {
-  var canonicalizedTag = canonicalizedTags[tag];
+irregularGrandfathered.forEach(function (tag) {
+  assert.sameValue(
+    isCanonicalizedStructurallyValidLanguageTag(tag), false,
+    "Test data \"" + tag + "\" is not a structurally valid language tag."
+  );
+});
+regularGrandfatheredNonUTS35.forEach(function (tag) {
+  assert.sameValue(
+    isCanonicalizedStructurallyValidLanguageTag(tag), false,
+    "Test data \"" + tag + "\" is not a structurally valid language tag."
+  );
+});
+Object.getOwnPropertyNames(regularGrandfatheredUTS35).forEach(function (tag) {
+  var canonicalizedTag = regularGrandfatheredUTS35[tag];
  assert(
    isCanonicalizedStructurallyValidLanguageTag(canonicalizedTag),
-    "Test data \"" + canonicalizedTag + "\" is not canonicalized and structurally valid language tag."
+    "Test data \"" + canonicalizedTag + "\" is a canonicalized and structurally valid language tag."
  );
 });

-Object.getOwnPropertyNames(canonicalizedTags).forEach(function (tag) {
+Object.getOwnPropertyNames(regularGrandfatheredUTS35).forEach(function (tag) {
  var canonicalLocales = Intl.getCanonicalLocales(tag);
  assert.sameValue(canonicalLocales.length, 1);
-  assert.sameValue(canonicalLocales[0], canonicalizedTags[tag]);
+  assert.sameValue(canonicalLocales[0], regularGrandfatheredUTS35[tag]);
 });
--- a/test/intl402/Intl/getCanonicalLocales/weird-cases.js
+++ b/test/intl402/Intl/getCanonicalLocales/weird-cases.js
@ -13,13 +13,12 @@ includes: [compareArray.js]

 var weirdCases =
  [
-   {locale: "x-u-foo", canonical: "und-x-u-foo"},
-   {locale: "en-x-u-foo"},
-   {locale: "en-a-bar-x-u-foo"},
-   {locale: "en-x-u-foo-a-bar"},
-   {locale: "en-a-bar-u-baz-x-u-foo"},
+   "en-x-u-foo",
+   "en-a-bar-x-u-foo",
+   "en-x-u-foo-a-bar",
+   "en-a-bar-u-baz-x-u-foo",
  ];

-weirdCases.forEach(function ({locale, canonical = locale}) {
-  assert(compareArray(Intl.getCanonicalLocales(locale), [canonical]));
+weirdCases.forEach(function (weird) {
+  assert(compareArray(Intl.getCanonicalLocales(weird), [weird]));
 });
--- a/test/intl402/ListFormat/constructor/constructor/locales-valid.js
+++ b/test/intl402/ListFormat/constructor/constructor/locales-valid.js
@ -17,17 +17,28 @@ const tests = [
  [undefined, defaultLocale, "undefined"],
  ["EN", "en", "Single value"],
  [[], defaultLocale, "Empty array"],
-  [["en-GB-oed"], "en-GB", "Grandfathered"],
-  [["x-private"], defaultLocale, "Private", ["lookup"]],
  [["en", "EN"], "en", "Duplicate value (canonical first)"],
  [["EN", "en"], "en", "Duplicate value (canonical last)"],
  [{ 0: "DE", length: 0 }, defaultLocale, "Object with zero length"],
  [{ 0: "DE", length: 1 }, "de", "Object with length"],
 ];

+const errorTests = [
+  [["en-GB-oed"], "Grandfathered"],
+  [["x-private"], "Private", ["lookup"]],
+];
+
 for (const [locales, expected, name, matchers = ["lookup", "best fit"]] of tests) {
  for (const matcher of matchers) {
    const rtf = new Intl.ListFormat(locales, {localeMatcher: matcher});
    assert.sameValue(rtf.resolvedOptions().locale, expected, name);
  }
 }
+
+for (const [locales, name, matchers = ["lookup", "best fit"]] of errorTests) {
+  for (const matcher of matchers) {
+    assert.throws(RangeError, function() {
+      new Intl.ListFormat(locales, {localeMatcher: matcher})
+    }, name);
+  }
+}
--- a/test/intl402/RelativeTimeFormat/constructor/constructor/locales-valid.js
+++ b/test/intl402/RelativeTimeFormat/constructor/constructor/locales-valid.js
@ -17,17 +17,28 @@ const tests = [
  [undefined, defaultLocale, "undefined"],
  ["EN", "en", "Single value"],
  [[], defaultLocale, "Empty array"],
-  [["en-GB-oed"], "en-GB", "Grandfathered"],
-  [["x-private"], defaultLocale, "Private", ["lookup"]],
  [["en", "EN"], "en", "Duplicate value (canonical first)"],
  [["EN", "en"], "en", "Duplicate value (canonical last)"],
  [{ 0: "DE", length: 0 }, defaultLocale, "Object with zero length"],
  [{ 0: "DE", length: 1 }, "de", "Object with length"],
 ];

+const errorTests = [
+  [["en-GB-oed"], "Grandfathered"],
+  [["x-private"], "Private", ["lookup"]],
+];
+
 for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests) {
  for (const matcher of matchers) {
    const rtf = new Intl.RelativeTimeFormat(locales, {localeMatcher: matcher});
    assert.sameValue(rtf.resolvedOptions().locale, expected, name);
  }
 }
+
+for (const [locales, name, matchers = ["best fit", "lookup"]] of errorTests) {
+  for (const matcher of matchers) {
+    assert.throws(RangeError, function() {
+      new Intl.RelativeTimeFormat(locales, {localeMatcher: matcher});
+    }, name);
+  }
+}
--- a/test/intl402/Segmenter/constructor/constructor/locales-valid.js
+++ b/test/intl402/Segmenter/constructor/constructor/locales-valid.js
@ -21,8 +21,6 @@ const tests = [
  [["sr"], ["sr"], "Single-element array"],
  [["fr", "ar"], ["fr", "ar"], "Two-element array"],
  [["xyz", "ar"], ["ar"], "Two-element array with unknown code"],
-  [["en-GB-oed"], ["en-GB"], "Grandfathered"],
-  [["x-private"], [defaultLocale], "Private", ["lookup"]],
  [["en", "EN"], ["en"], "Duplicate value (canonical first)"],
  [["EN", "en"], ["en"], "Duplicate value (canonical last)"],
  [{ 0: "DE", length: 0 }, [defaultLocale], "Object with zero length"],
@ -31,6 +29,11 @@ const tests = [
  [{ 1: "ja", 2: "fr" }, [defaultLocale], "Object without length, indexed from 1"],
 ];

+const errorTests = [
+  [["en-GB-oed"], "Grandfathered"],
+  [["x-private"], "Private", ["lookup"]],
+];
+
 for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests) {
  for (const localeMatcher of matchers) {
    const segmenter = new Intl.Segmenter(locales, { localeMatcher });
@ -38,3 +41,11 @@ for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests
    assert(expected.includes(actual), `${name}: expected one of ${expected}, found ${actual}`);
  }
 }
+
+for (const [locales, name, matchers = ["best fit", "lookup"]] of errorTests) {
+  for (const localeMatcher of matchers) {
+    assert.throws(RangeError, function() {
+      new Intl.Segmenter(locales, { localeMatcher });
+    }, name);
+  }
+}
--- a/test/intl402/language-tags-canonicalized.js
+++ b/test/intl402/language-tags-canonicalized.js
@ -21,15 +21,10 @@ var canonicalizedTags = {
    // -u-ca is incomplete, so it will not show up in resolvedOptions().locale
    "cmn-hans-cn-u-ca-t-ca-x-t-u": ["zh-Hans-CN-t-ca-u-ca-x-t-u", "zh-Hans-CN-t-ca-x-t-u", "zh-Hans-CN-t-ca-x-t", "zh-Hans-CN-t-ca", "zh-Hans-CN", "zh-Hans", "zh"],
    "de-gregory-u-ca-gregory": ["de-gregory-u-ca-gregory", "de-gregory", "de-u-ca-gregory", "de"],
-    "no-nyn": ["nn"],
-    "i-klingon": ["tlh"],
    "sgn-GR": ["gss"],
    "ji": ["yi"],
    "de-DD": ["de-DE", "de"],
-    "zh-hak-CN": ["hak-CN", "hak"],
-    "sgn-ils": ["ils"],
    "in": ["id"],
-    "x-foo": ["x-foo"]
 };

 // make sure the data above is correct
--- a/test/intl402/language-tags-invalid.js
+++ b/test/intl402/language-tags-invalid.js
@ -10,24 +10,7 @@ author: Norbert Lindenberg
 includes: [testIntl.js]
 ---*/

-var invalidLanguageTags = [
-    "", // empty tag
-    "i", // singleton alone
-    "x", // private use without subtag
-    "u", // extension singleton in first place
-    "419", // region code in first place
-    "u-nu-latn-cu-bob", // extension sequence without language
-    "hans-cmn-cn", // "hans" could theoretically be a 4-letter language code,
-                   // but those can't be followed by extlang codes.
-    "cmn-hans-cn-u-u", // duplicate singleton
-    "cmn-hans-cn-t-u-ca-u", // duplicate singleton
-    "de-gregory-gregory", // duplicate variant
-    "*", // language range
-    "de-*", // language range
-    "中文", // non-ASCII letters
-    "en-ß", // non-ASCII letters
-    "ıd" // non-ASCII letters
-];
+var invalidLanguageTags = getInvalidLanguageTags();

 testWithIntlConstructors(function (Constructor) {
    invalidLanguageTags.forEach(function (tag) {
--- a/test/intl402/language-tags-valid.js
+++ b/test/intl402/language-tags-valid.js
@ -18,15 +18,9 @@ var validLanguageTags = [
    "cmn-hans-cn", // + ISO 3166-1 country code
    "es-419", // + UN M.49 region code
    "es-419-u-nu-latn-cu-bob", // + Unicode locale extension sequence
-    "i-klingon", // grandfathered tag
    "cmn-hans-cn-t-ca-u-ca-x-t-u", // singleton subtags can also be used as private use subtags
    "de-gregory-u-ca-gregory", // variant and extension subtags may be the same
    "aa-a-foo-x-a-foo-bar", // variant subtags can also be used as private use subtags
-    "x-en-US-12345", // anything goes in private use tags
-    "x-12345-12345-en-US",
-    "x-en-US-12345-12345",
-    "x-en-u-foo",
-    "x-en-u-foo-u-bar"
 ];

 testWithIntlConstructors(function (Constructor) {