mirror of https://github.com/tc39/test262.git
Update tests to parse language tags as Unicode BCP 47 Locale Identifiers
harness/testIntl.js
- Add now-invalid tags to getInvalidLanguageTags; these tags were previously used in test files changed in this commit.
- Update isCanonicalizedStructurallyValidLanguageTag regular expressions.

test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js
- Moved five now-invalid tags to the getInvalidLanguageTags function in testIntl.js.

test/intl402/Intl/getCanonicalLocales/preferred-grandfathered.js
- All irregular grandfathered tags are invalid now.
- Regular grandfathered tags with extlang subtags are now also invalid.
- Regular grandfathered tags with variant-like subtags are still valid.

test/intl402/Intl/getCanonicalLocales/weird-cases.js
- Revert changes from last commit.
- "x-u-foo" is now invalid and was moved to the getInvalidLanguageTags function.

test/intl402/ListFormat/constructor/constructor/locales-valid.js
test/intl402/RelativeTimeFormat/constructor/constructor/locales-valid.js
test/intl402/Segmenter/constructor/constructor/locales-valid.js
- Irregular grandfathered tags and privateuse-only tags are no longer valid language tags.

test/intl402/language-tags-canonicalized.js
- Same changes as in test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js.

test/intl402/language-tags-invalid.js
- The invalid tags list in this file was a subset of getInvalidLanguageTags, so it was replaced with getInvalidLanguageTags to get more coverage.

test/intl402/language-tags-valid.js
- Same changes as in test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js.
This commit is contained in:
parent
94053978bc
commit
f7e8dba39b
|
@ -189,6 +189,19 @@ function getInvalidLanguageTags() {
|
|||
"de-1996-1996", // duplicate numeric variant
|
||||
"pt-u-ca-gregory-u-nu-latn", // duplicate singleton subtag
|
||||
|
||||
// Tags that became invalid as of https://github.com/tc39/ecma402/pull/289
|
||||
"no-nyn", // regular grandfathered in BCP47, but invalid in UTS35
|
||||
"i-klingon", // irregular grandfathered in BCP47, but invalid in UTS35
|
||||
"zh-hak-CN", // language with extlang in BCP47, but invalid in UTS35
|
||||
"sgn-ils", // language with extlang in BCP47, but invalid in UTS35
|
||||
"x-foo", // privateuse-only in BCP47, but invalid in UTS35
|
||||
"x-en-US-12345", // more privateuse-only variants.
|
||||
"x-12345-12345-en-US",
|
||||
"x-en-US-12345-12345",
|
||||
"x-en-u-foo",
|
||||
"x-en-u-foo-u-bar",
|
||||
"x-u-foo",
|
||||
|
||||
// underscores in different parts of the language tag
|
||||
"de_DE",
|
||||
"DE_de",
|
||||
|
@ -238,27 +251,32 @@ function getInvalidLanguageTags() {
|
|||
function isCanonicalizedStructurallyValidLanguageTag(locale) {
|
||||
|
||||
/**
|
||||
* Regular expression defining BCP 47 language tags.
|
||||
* Regular expression defining Unicode BCP 47 Locale Identifiers.
|
||||
*
|
||||
* Spec: RFC 5646 section 2.1.
|
||||
* Spec: https://unicode.org/reports/tr35/#Unicode_locale_identifier
|
||||
*/
|
||||
var alpha = "[a-zA-Z]",
|
||||
var alpha = "[a-z]",
|
||||
digit = "[0-9]",
|
||||
alphanum = "(" + alpha + "|" + digit + ")",
|
||||
regular = "(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)",
|
||||
irregular = "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)",
|
||||
grandfathered = "(" + irregular + "|" + regular + ")",
|
||||
privateuse = "(x(-[a-z0-9]{1,8})+)",
|
||||
singleton = "(" + digit + "|[A-WY-Za-wy-z])",
|
||||
extension = "(" + singleton + "(-" + alphanum + "{2,8})+)",
|
||||
variant = "(" + alphanum + "{5,8}|(" + digit + alphanum + "{3}))",
|
||||
region = "(" + alpha + "{2}|" + digit + "{3})",
|
||||
script = "(" + alpha + "{4})",
|
||||
extlang = "(" + alpha + "{3}(-" + alpha + "{3}){0,2})",
|
||||
language = "(" + alpha + "{2,3}(-" + extlang + ")?|" + alpha + "{4}|" + alpha + "{5,8})",
|
||||
langtag = language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*(-" + extension + ")*(-" + privateuse + ")?",
|
||||
languageTag = "^(" + langtag + "|" + privateuse + "|" + grandfathered + ")$",
|
||||
language = "(" + alpha + "{2,3}|" + alpha + "{5,8})",
|
||||
privateuse = "(x(-[a-z0-9]{1,8})+)",
|
||||
singleton = "(" + digit + "|[a-wy-z])",
|
||||
attribute= "(" + alphanum + "{3,8})",
|
||||
keyword = "(" + alphanum + alpha + "(-" + alphanum + "{3,8})*)",
|
||||
unicode_locale_extensions = "(u((-" + keyword + ")+|((-" + attribute + ")+(-" + keyword + ")*)))",
|
||||
tlang = "(" + language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*)",
|
||||
tfield = "(" + alpha + digit + "(-" + alphanum + "{3,8})+)",
|
||||
transformed_extensions = "(t((-" + tlang + "(-" + tfield + ")*)|(-" + tfield + ")+))",
|
||||
other_singleton = "(" + digit + "|[a-sv-wy-z])",
|
||||
other_extensions = "(" + other_singleton + "(-" + alphanum + "{2,8})+)",
|
||||
extension = "(" + unicode_locale_extensions + "|" + transformed_extensions + "|" + other_extensions + ")",
|
||||
locale_id = language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*(-" + extension + ")*(-" + privateuse + ")?",
|
||||
languageTag = "^(" + locale_id + ")$",
|
||||
languageTagRE = new RegExp(languageTag, "i");
|
||||
|
||||
var duplicateSingleton = "-" + singleton + "-(.*-)?\\1(?!" + alphanum + ")",
|
||||
duplicateSingletonRE = new RegExp(duplicateSingleton, "i"),
|
||||
duplicateVariant = "(" + alphanum + "{2,8}-)+" + variant + "-(" + alphanum + "{2,8}-)*\\3(?!" + alphanum + ")",
|
||||
|
@ -266,7 +284,7 @@ function isCanonicalizedStructurallyValidLanguageTag(locale) {
|
|||
|
||||
|
||||
/**
|
||||
* Verifies that the given string is a well-formed BCP 47 language tag
|
||||
* Verifies that the given string is a well-formed Unicode BCP 47 Locale Identifier
|
||||
* with no duplicate variant or singleton subtags.
|
||||
*
|
||||
* Spec: ECMAScript Internationalization API Specification, draft, 6.2.2.
|
||||
|
|
|
@ -37,15 +37,10 @@ var canonicalizedTags = {
|
|||
"es-419-u-nu-latn": "es-419-u-nu-latn",
|
||||
"cmn-hans-cn-u-ca-t-ca-x-t-u": "zh-Hans-CN-t-ca-u-ca-x-t-u",
|
||||
"de-gregory-u-ca-gregory": "de-gregory-u-ca-gregory",
|
||||
"no-nyn": "nn",
|
||||
"i-klingon": "tlh",
|
||||
"sgn-GR": "gss",
|
||||
"ji": "yi",
|
||||
"de-DD": "de-DE",
|
||||
"zh-hak-CN": "hak-CN",
|
||||
"sgn-ils": "ils",
|
||||
"in": "id",
|
||||
"x-foo": "und-x-foo",
|
||||
"sr-cyrl-ekavsk": "sr-Cyrl-ekavsk",
|
||||
"en-ca-newfound": "en-CA-newfound",
|
||||
"sl-rozaj-biske-1994": "sl-rozaj-biske-1994",
|
||||
|
|
|
@ -20,67 +20,77 @@ info: |
|
|||
...
|
||||
|
||||
6.2.3 CanonicalizeLanguageTag ( locale )
|
||||
The CanonicalizeLanguageTag abstract operation returns the canonical and case-regularized
|
||||
form of the locale argument (which must be a String value that is a structurally valid
|
||||
BCP 47 language tag as verified by the IsStructurallyValidLanguageTag abstract operation).
|
||||
A conforming implementation shall take the steps specified in RFC 5646 section 4.5, or
|
||||
successor, to bring the language tag into canonical form, and to regularize the case of
|
||||
the subtags. Furthermore, a conforming implementation shall not take the steps to bring
|
||||
a language tag into "extlang form", nor shall it reorder variant subtags.
|
||||
|
||||
The specifications for extensions to BCP 47 language tags, such as RFC 6067, may include
|
||||
canonicalization rules for the extension subtag sequences they define that go beyond the
|
||||
canonicalization rules of RFC 5646 section 4.5. Implementations are allowed, but not
|
||||
required, to apply these additional rules.
|
||||
The CanonicalizeLanguageTag abstract operation returns the canonical and case-regularized form
|
||||
of the locale argument (which must be a String value that is a structurally valid Unicode
|
||||
BCP 47 Locale Identifier as verified by the IsStructurallyValidLanguageTag abstract operation).
|
||||
A conforming implementation shall take the steps specified in the “BCP 47 Language Tag to
|
||||
Unicode BCP 47 Locale Identifier” algorithm, from Unicode Technical Standard #35 LDML
|
||||
§ 3.3.1 BCP 47 Language Tag Conversion.
|
||||
|
||||
includes: [testIntl.js]
|
||||
---*/
|
||||
|
||||
// Generated from http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
|
||||
// File-Date: 2017-08-15
|
||||
var canonicalizedTags = {
|
||||
// Irregular tags.
|
||||
"en-gb-oed": "en-GB-oxendict",
|
||||
"i-ami": "ami",
|
||||
"i-bnn": "bnn",
|
||||
"i-default": "und-x-i-default",
|
||||
"i-enochian": "und-x-i-enochian",
|
||||
"i-hak": "hak",
|
||||
"i-klingon": "tlh",
|
||||
"i-lux": "lb",
|
||||
"i-mingo": "und-x-i-mingo",
|
||||
"i-navajo": "nv",
|
||||
"i-pwn": "pwn",
|
||||
"i-tao": "tao",
|
||||
"i-tay": "tay",
|
||||
"i-tsu": "tsu",
|
||||
"sgn-be-fr": "sfb",
|
||||
"sgn-be-nl": "vgt",
|
||||
"sgn-ch-de": "sgg",
|
||||
|
||||
// Regular tags.
|
||||
var irregularGrandfathered = [
|
||||
"en-gb-oed",
|
||||
"i-ami",
|
||||
"i-bnn",
|
||||
"i-default",
|
||||
"i-enochian",
|
||||
"i-hak",
|
||||
"i-klingon",
|
||||
"i-lux",
|
||||
"i-mingo",
|
||||
"i-navajo",
|
||||
"i-pwn",
|
||||
"i-tao",
|
||||
"i-tay",
|
||||
"i-tsu",
|
||||
"sgn-be-fr",
|
||||
"sgn-be-nl",
|
||||
"sgn-ch-de",
|
||||
];
|
||||
|
||||
var regularGrandfatheredNonUTS35 = [
|
||||
"no-bok",
|
||||
"no-nyn",
|
||||
"zh-min",
|
||||
"zh-min-nan",
|
||||
];
|
||||
|
||||
var regularGrandfatheredUTS35 = {
|
||||
"art-lojban": "jbo",
|
||||
"cel-gaulish": "und-x-cel-gaulish",
|
||||
"no-bok": "nb",
|
||||
"no-nyn": "nn",
|
||||
"zh-guoyu": "zh",
|
||||
"zh-hakka": "hak",
|
||||
"zh-min": "und-x-zh-min",
|
||||
"zh-min-nan": "nan",
|
||||
"zh-xiang": "hsn",
|
||||
};
|
||||
|
||||
// make sure the data above is correct
|
||||
Object.getOwnPropertyNames(canonicalizedTags).forEach(function (tag) {
|
||||
var canonicalizedTag = canonicalizedTags[tag];
|
||||
irregularGrandfathered.forEach(function (tag) {
|
||||
assert.sameValue(
|
||||
isCanonicalizedStructurallyValidLanguageTag(tag), false,
|
||||
"Test data \"" + tag + "\" is not a structurally valid language tag."
|
||||
);
|
||||
});
|
||||
regularGrandfatheredNonUTS35.forEach(function (tag) {
|
||||
assert.sameValue(
|
||||
isCanonicalizedStructurallyValidLanguageTag(tag), false,
|
||||
"Test data \"" + tag + "\" is not a structurally valid language tag."
|
||||
);
|
||||
});
|
||||
Object.getOwnPropertyNames(regularGrandfatheredUTS35).forEach(function (tag) {
|
||||
var canonicalizedTag = regularGrandfatheredUTS35[tag];
|
||||
assert(
|
||||
isCanonicalizedStructurallyValidLanguageTag(canonicalizedTag),
|
||||
"Test data \"" + canonicalizedTag + "\" is not canonicalized and structurally valid language tag."
|
||||
"Test data \"" + canonicalizedTag + "\" is a canonicalized and structurally valid language tag."
|
||||
);
|
||||
});
|
||||
|
||||
Object.getOwnPropertyNames(canonicalizedTags).forEach(function (tag) {
|
||||
Object.getOwnPropertyNames(regularGrandfatheredUTS35).forEach(function (tag) {
|
||||
var canonicalLocales = Intl.getCanonicalLocales(tag);
|
||||
assert.sameValue(canonicalLocales.length, 1);
|
||||
assert.sameValue(canonicalLocales[0], canonicalizedTags[tag]);
|
||||
assert.sameValue(canonicalLocales[0], regularGrandfatheredUTS35[tag]);
|
||||
});
|
||||
|
|
|
@ -13,13 +13,12 @@ includes: [compareArray.js]
|
|||
|
||||
var weirdCases =
|
||||
[
|
||||
{locale: "x-u-foo", canonical: "und-x-u-foo"},
|
||||
{locale: "en-x-u-foo"},
|
||||
{locale: "en-a-bar-x-u-foo"},
|
||||
{locale: "en-x-u-foo-a-bar"},
|
||||
{locale: "en-a-bar-u-baz-x-u-foo"},
|
||||
"en-x-u-foo",
|
||||
"en-a-bar-x-u-foo",
|
||||
"en-x-u-foo-a-bar",
|
||||
"en-a-bar-u-baz-x-u-foo",
|
||||
];
|
||||
|
||||
weirdCases.forEach(function ({locale, canonical = locale}) {
|
||||
assert(compareArray(Intl.getCanonicalLocales(locale), [canonical]));
|
||||
weirdCases.forEach(function (weird) {
|
||||
assert(compareArray(Intl.getCanonicalLocales(weird), [weird]));
|
||||
});
|
||||
|
|
|
@ -17,17 +17,28 @@ const tests = [
|
|||
[undefined, defaultLocale, "undefined"],
|
||||
["EN", "en", "Single value"],
|
||||
[[], defaultLocale, "Empty array"],
|
||||
[["en-GB-oed"], "en-GB", "Grandfathered"],
|
||||
[["x-private"], defaultLocale, "Private", ["lookup"]],
|
||||
[["en", "EN"], "en", "Duplicate value (canonical first)"],
|
||||
[["EN", "en"], "en", "Duplicate value (canonical last)"],
|
||||
[{ 0: "DE", length: 0 }, defaultLocale, "Object with zero length"],
|
||||
[{ 0: "DE", length: 1 }, "de", "Object with length"],
|
||||
];
|
||||
|
||||
const errorTests = [
|
||||
[["en-GB-oed"], "Grandfathered"],
|
||||
[["x-private"], "Private", ["lookup"]],
|
||||
];
|
||||
|
||||
for (const [locales, expected, name, matchers = ["lookup", "best fit"]] of tests) {
|
||||
for (const matcher of matchers) {
|
||||
const rtf = new Intl.ListFormat(locales, {localeMatcher: matcher});
|
||||
assert.sameValue(rtf.resolvedOptions().locale, expected, name);
|
||||
}
|
||||
}
|
||||
|
||||
for (const [locales, name, matchers = ["lookup", "best fit"]] of errorTests) {
|
||||
for (const matcher of matchers) {
|
||||
assert.throws(RangeError, function() {
|
||||
new Intl.ListFormat(locales, {localeMatcher: matcher})
|
||||
}, name);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,17 +17,28 @@ const tests = [
|
|||
[undefined, defaultLocale, "undefined"],
|
||||
["EN", "en", "Single value"],
|
||||
[[], defaultLocale, "Empty array"],
|
||||
[["en-GB-oed"], "en-GB", "Grandfathered"],
|
||||
[["x-private"], defaultLocale, "Private", ["lookup"]],
|
||||
[["en", "EN"], "en", "Duplicate value (canonical first)"],
|
||||
[["EN", "en"], "en", "Duplicate value (canonical last)"],
|
||||
[{ 0: "DE", length: 0 }, defaultLocale, "Object with zero length"],
|
||||
[{ 0: "DE", length: 1 }, "de", "Object with length"],
|
||||
];
|
||||
|
||||
const errorTests = [
|
||||
[["en-GB-oed"], "Grandfathered"],
|
||||
[["x-private"], "Private", ["lookup"]],
|
||||
];
|
||||
|
||||
for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests) {
|
||||
for (const matcher of matchers) {
|
||||
const rtf = new Intl.RelativeTimeFormat(locales, {localeMatcher: matcher});
|
||||
assert.sameValue(rtf.resolvedOptions().locale, expected, name);
|
||||
}
|
||||
}
|
||||
|
||||
for (const [locales, name, matchers = ["best fit", "lookup"]] of errorTests) {
|
||||
for (const matcher of matchers) {
|
||||
assert.throws(RangeError, function() {
|
||||
new Intl.RelativeTimeFormat(locales, {localeMatcher: matcher});
|
||||
}, name);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,8 +21,6 @@ const tests = [
|
|||
[["sr"], ["sr"], "Single-element array"],
|
||||
[["fr", "ar"], ["fr", "ar"], "Two-element array"],
|
||||
[["xyz", "ar"], ["ar"], "Two-element array with unknown code"],
|
||||
[["en-GB-oed"], ["en-GB"], "Grandfathered"],
|
||||
[["x-private"], [defaultLocale], "Private", ["lookup"]],
|
||||
[["en", "EN"], ["en"], "Duplicate value (canonical first)"],
|
||||
[["EN", "en"], ["en"], "Duplicate value (canonical last)"],
|
||||
[{ 0: "DE", length: 0 }, [defaultLocale], "Object with zero length"],
|
||||
|
@ -31,6 +29,11 @@ const tests = [
|
|||
[{ 1: "ja", 2: "fr" }, [defaultLocale], "Object without length, indexed from 1"],
|
||||
];
|
||||
|
||||
const errorTests = [
|
||||
[["en-GB-oed"], "Grandfathered"],
|
||||
[["x-private"], "Private", ["lookup"]],
|
||||
];
|
||||
|
||||
for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests) {
|
||||
for (const localeMatcher of matchers) {
|
||||
const segmenter = new Intl.Segmenter(locales, { localeMatcher });
|
||||
|
@ -38,3 +41,11 @@ for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests
|
|||
assert(expected.includes(actual), `${name}: expected one of ${expected}, found ${actual}`);
|
||||
}
|
||||
}
|
||||
|
||||
for (const [locales, name, matchers = ["best fit", "lookup"]] of errorTests) {
|
||||
for (const localeMatcher of matchers) {
|
||||
assert.throws(RangeError, function() {
|
||||
new Intl.Segmenter(locales, { localeMatcher });
|
||||
}, name);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,15 +21,10 @@ var canonicalizedTags = {
|
|||
// -u-ca is incomplete, so it will not show up in resolvedOptions().locale
|
||||
"cmn-hans-cn-u-ca-t-ca-x-t-u": ["zh-Hans-CN-t-ca-u-ca-x-t-u", "zh-Hans-CN-t-ca-x-t-u", "zh-Hans-CN-t-ca-x-t", "zh-Hans-CN-t-ca", "zh-Hans-CN", "zh-Hans", "zh"],
|
||||
"de-gregory-u-ca-gregory": ["de-gregory-u-ca-gregory", "de-gregory", "de-u-ca-gregory", "de"],
|
||||
"no-nyn": ["nn"],
|
||||
"i-klingon": ["tlh"],
|
||||
"sgn-GR": ["gss"],
|
||||
"ji": ["yi"],
|
||||
"de-DD": ["de-DE", "de"],
|
||||
"zh-hak-CN": ["hak-CN", "hak"],
|
||||
"sgn-ils": ["ils"],
|
||||
"in": ["id"],
|
||||
"x-foo": ["x-foo"]
|
||||
};
|
||||
|
||||
// make sure the data above is correct
|
||||
|
|
|
@ -10,24 +10,7 @@ author: Norbert Lindenberg
|
|||
includes: [testIntl.js]
|
||||
---*/
|
||||
|
||||
var invalidLanguageTags = [
|
||||
"", // empty tag
|
||||
"i", // singleton alone
|
||||
"x", // private use without subtag
|
||||
"u", // extension singleton in first place
|
||||
"419", // region code in first place
|
||||
"u-nu-latn-cu-bob", // extension sequence without language
|
||||
"hans-cmn-cn", // "hans" could theoretically be a 4-letter language code,
|
||||
// but those can't be followed by extlang codes.
|
||||
"cmn-hans-cn-u-u", // duplicate singleton
|
||||
"cmn-hans-cn-t-u-ca-u", // duplicate singleton
|
||||
"de-gregory-gregory", // duplicate variant
|
||||
"*", // language range
|
||||
"de-*", // language range
|
||||
"中文", // non-ASCII letters
|
||||
"en-ß", // non-ASCII letters
|
||||
"ıd" // non-ASCII letters
|
||||
];
|
||||
var invalidLanguageTags = getInvalidLanguageTags();
|
||||
|
||||
testWithIntlConstructors(function (Constructor) {
|
||||
invalidLanguageTags.forEach(function (tag) {
|
||||
|
|
|
@ -18,15 +18,9 @@ var validLanguageTags = [
|
|||
"cmn-hans-cn", // + ISO 3166-1 country code
|
||||
"es-419", // + UN M.49 region code
|
||||
"es-419-u-nu-latn-cu-bob", // + Unicode locale extension sequence
|
||||
"i-klingon", // grandfathered tag
|
||||
"cmn-hans-cn-t-ca-u-ca-x-t-u", // singleton subtags can also be used as private use subtags
|
||||
"de-gregory-u-ca-gregory", // variant and extension subtags may be the same
|
||||
"aa-a-foo-x-a-foo-bar", // variant subtags can also be used as private use subtags
|
||||
"x-en-US-12345", // anything goes in private use tags
|
||||
"x-12345-12345-en-US",
|
||||
"x-en-US-12345-12345",
|
||||
"x-en-u-foo",
|
||||
"x-en-u-foo-u-bar"
|
||||
];
|
||||
|
||||
testWithIntlConstructors(function (Constructor) {
|
||||
|
|
Loading…
Reference in New Issue