Update tests to parse language tags as Unicode BCP 47 Locale Identifiers

harness/testIntl.js
- Add now-invalid tags to getInvalidLanguageTags; these tags were previously used in test files changed in this commit.
- Update isCanonicalizedStructurallyValidLanguageTag regular expressions.

test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js
- Moved five now invalid tags to getInvalidLanguageTags function in testIntl.js

test/intl402/Intl/getCanonicalLocales/preferred-grandfathered.js
- All irregular grandfathered tags are invalid now
- Regular grandfathered tags with extlang subtags are now also invalid
- Regular grandfathered tags with variant-like subtags are still valid

test/intl402/Intl/getCanonicalLocales/weird-cases.js
- Revert changes from last commit
- "x-u-foo" is now invalid and was moved to getInvalidLanguageTags function

test/intl402/ListFormat/constructor/constructor/locales-valid.js
test/intl402/RelativeTimeFormat/constructor/constructor/locales-valid.js
test/intl402/Segmenter/constructor/constructor/locales-valid.js
- Irregular grandfathered tags and privateuse-only tags are no longer valid language tags

test/intl402/language-tags-canonicalized.js
- Same changes as in test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js

test/intl402/language-tags-invalid.js
- The invalid tags list in this file was a subset of getInvalidLanguageTags, so it was replaced with a call to getInvalidLanguageTags to get more coverage

test/intl402/language-tags-valid.js
- Same changes as in test/intl402/Intl/getCanonicalLocales/canonicalized-tags.js
This commit is contained in:
André Bargull 2019-03-11 10:33:40 -07:00
parent 94053978bc
commit f7e8dba39b
10 changed files with 129 additions and 102 deletions

View File

@ -189,6 +189,19 @@ function getInvalidLanguageTags() {
"de-1996-1996", // duplicate numeric variant
"pt-u-ca-gregory-u-nu-latn", // duplicate singleton subtag
// Invalid tags starting with: https://github.com/tc39/ecma402/pull/289
"no-nyn", // regular grandfathered in BCP47, but invalid in UTS35
"i-klingon", // irregular grandfathered in BCP47, but invalid in UTS35
"zh-hak-CN", // language with extlang in BCP47, but invalid in UTS35
"sgn-ils", // language with extlang in BCP47, but invalid in UTS35
"x-foo", // privateuse-only in BCP47, but invalid in UTS35
"x-en-US-12345", // more privateuse-only variants.
"x-12345-12345-en-US",
"x-en-US-12345-12345",
"x-en-u-foo",
"x-en-u-foo-u-bar",
"x-u-foo",
// underscores in different parts of the language tag
"de_DE",
"DE_de",
@ -238,27 +251,32 @@ function getInvalidLanguageTags() {
function isCanonicalizedStructurallyValidLanguageTag(locale) {
/**
* Regular expression defining BCP 47 language tags.
* Regular expression defining Unicode BCP 47 Locale Identifiers.
*
* Spec: RFC 5646 section 2.1.
* Spec: https://unicode.org/reports/tr35/#Unicode_locale_identifier
*/
var alpha = "[a-zA-Z]",
var alpha = "[a-z]",
digit = "[0-9]",
alphanum = "(" + alpha + "|" + digit + ")",
regular = "(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)",
irregular = "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)",
grandfathered = "(" + irregular + "|" + regular + ")",
privateuse = "(x(-[a-z0-9]{1,8})+)",
singleton = "(" + digit + "|[A-WY-Za-wy-z])",
extension = "(" + singleton + "(-" + alphanum + "{2,8})+)",
variant = "(" + alphanum + "{5,8}|(" + digit + alphanum + "{3}))",
region = "(" + alpha + "{2}|" + digit + "{3})",
script = "(" + alpha + "{4})",
extlang = "(" + alpha + "{3}(-" + alpha + "{3}){0,2})",
language = "(" + alpha + "{2,3}(-" + extlang + ")?|" + alpha + "{4}|" + alpha + "{5,8})",
langtag = language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*(-" + extension + ")*(-" + privateuse + ")?",
languageTag = "^(" + langtag + "|" + privateuse + "|" + grandfathered + ")$",
language = "(" + alpha + "{2,3}|" + alpha + "{5,8})",
privateuse = "(x(-[a-z0-9]{1,8})+)",
singleton = "(" + digit + "|[a-wy-z])",
attribute= "(" + alphanum + "{3,8})",
keyword = "(" + alphanum + alpha + "(-" + alphanum + "{3,8})*)",
unicode_locale_extensions = "(u((-" + keyword + ")+|((-" + attribute + ")+(-" + keyword + ")*)))",
tlang = "(" + language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*)",
tfield = "(" + alpha + digit + "(-" + alphanum + "{3,8})+)",
transformed_extensions = "(t((-" + tlang + "(-" + tfield + ")*)|(-" + tfield + ")+))",
other_singleton = "(" + digit + "|[a-sv-wy-z])",
other_extensions = "(" + other_singleton + "(-" + alphanum + "{2,8})+)",
extension = "(" + unicode_locale_extensions + "|" + transformed_extensions + "|" + other_extensions + ")",
locale_id = language + "(-" + script + ")?(-" + region + ")?(-" + variant + ")*(-" + extension + ")*(-" + privateuse + ")?",
languageTag = "^(" + locale_id + ")$",
languageTagRE = new RegExp(languageTag, "i");
var duplicateSingleton = "-" + singleton + "-(.*-)?\\1(?!" + alphanum + ")",
duplicateSingletonRE = new RegExp(duplicateSingleton, "i"),
duplicateVariant = "(" + alphanum + "{2,8}-)+" + variant + "-(" + alphanum + "{2,8}-)*\\3(?!" + alphanum + ")",
@ -266,7 +284,7 @@ function isCanonicalizedStructurallyValidLanguageTag(locale) {
/**
* Verifies that the given string is a well-formed BCP 47 language tag
* Verifies that the given string is a well-formed Unicode BCP 47 Locale Identifier
* with no duplicate variant or singleton subtags.
*
* Spec: ECMAScript Internationalization API Specification, draft, 6.2.2.

View File

@ -37,15 +37,10 @@ var canonicalizedTags = {
"es-419-u-nu-latn": "es-419-u-nu-latn",
"cmn-hans-cn-u-ca-t-ca-x-t-u": "zh-Hans-CN-t-ca-u-ca-x-t-u",
"de-gregory-u-ca-gregory": "de-gregory-u-ca-gregory",
"no-nyn": "nn",
"i-klingon": "tlh",
"sgn-GR": "gss",
"ji": "yi",
"de-DD": "de-DE",
"zh-hak-CN": "hak-CN",
"sgn-ils": "ils",
"in": "id",
"x-foo": "und-x-foo",
"sr-cyrl-ekavsk": "sr-Cyrl-ekavsk",
"en-ca-newfound": "en-CA-newfound",
"sl-rozaj-biske-1994": "sl-rozaj-biske-1994",

View File

@ -20,67 +20,77 @@ info: |
...
6.2.3 CanonicalizeLanguageTag ( locale )
The CanonicalizeLanguageTag abstract operation returns the canonical and case-regularized
form of the locale argument (which must be a String value that is a structurally valid
BCP 47 language tag as verified by the IsStructurallyValidLanguageTag abstract operation).
A conforming implementation shall take the steps specified in RFC 5646 section 4.5, or
successor, to bring the language tag into canonical form, and to regularize the case of
the subtags. Furthermore, a conforming implementation shall not take the steps to bring
a language tag into "extlang form", nor shall it reorder variant subtags.
The specifications for extensions to BCP 47 language tags, such as RFC 6067, may include
canonicalization rules for the extension subtag sequences they define that go beyond the
canonicalization rules of RFC 5646 section 4.5. Implementations are allowed, but not
required, to apply these additional rules.
The CanonicalizeLanguageTag abstract operation returns the canonical and case-regularized form
of the locale argument (which must be a String value that is a structurally valid Unicode
BCP 47 Locale Identifier as verified by the IsStructurallyValidLanguageTag abstract operation).
A conforming implementation shall take the steps specified in the BCP 47 Language Tag to
Unicode BCP 47 Locale Identifier algorithm, from Unicode Technical Standard #35 LDML
§ 3.3.1 BCP 47 Language Tag Conversion.
includes: [testIntl.js]
---*/
// Generated from http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
// File-Date: 2017-08-15
var canonicalizedTags = {
// Irregular tags.
"en-gb-oed": "en-GB-oxendict",
"i-ami": "ami",
"i-bnn": "bnn",
"i-default": "und-x-i-default",
"i-enochian": "und-x-i-enochian",
"i-hak": "hak",
"i-klingon": "tlh",
"i-lux": "lb",
"i-mingo": "und-x-i-mingo",
"i-navajo": "nv",
"i-pwn": "pwn",
"i-tao": "tao",
"i-tay": "tay",
"i-tsu": "tsu",
"sgn-be-fr": "sfb",
"sgn-be-nl": "vgt",
"sgn-ch-de": "sgg",
// Regular tags.
var irregularGrandfathered = [
"en-gb-oed",
"i-ami",
"i-bnn",
"i-default",
"i-enochian",
"i-hak",
"i-klingon",
"i-lux",
"i-mingo",
"i-navajo",
"i-pwn",
"i-tao",
"i-tay",
"i-tsu",
"sgn-be-fr",
"sgn-be-nl",
"sgn-ch-de",
];
var regularGrandfatheredNonUTS35 = [
"no-bok",
"no-nyn",
"zh-min",
"zh-min-nan",
];
var regularGrandfatheredUTS35 = {
"art-lojban": "jbo",
"cel-gaulish": "und-x-cel-gaulish",
"no-bok": "nb",
"no-nyn": "nn",
"zh-guoyu": "zh",
"zh-hakka": "hak",
"zh-min": "und-x-zh-min",
"zh-min-nan": "nan",
"zh-xiang": "hsn",
};
// make sure the data above is correct
Object.getOwnPropertyNames(canonicalizedTags).forEach(function (tag) {
var canonicalizedTag = canonicalizedTags[tag];
irregularGrandfathered.forEach(function (tag) {
assert.sameValue(
isCanonicalizedStructurallyValidLanguageTag(tag), false,
"Test data \"" + tag + "\" is not a structurally valid language tag."
);
});
regularGrandfatheredNonUTS35.forEach(function (tag) {
assert.sameValue(
isCanonicalizedStructurallyValidLanguageTag(tag), false,
"Test data \"" + tag + "\" is not a structurally valid language tag."
);
});
Object.getOwnPropertyNames(regularGrandfatheredUTS35).forEach(function (tag) {
var canonicalizedTag = regularGrandfatheredUTS35[tag];
assert(
isCanonicalizedStructurallyValidLanguageTag(canonicalizedTag),
"Test data \"" + canonicalizedTag + "\" is not canonicalized and structurally valid language tag."
"Test data \"" + canonicalizedTag + "\" is a canonicalized and structurally valid language tag."
);
});
Object.getOwnPropertyNames(canonicalizedTags).forEach(function (tag) {
Object.getOwnPropertyNames(regularGrandfatheredUTS35).forEach(function (tag) {
var canonicalLocales = Intl.getCanonicalLocales(tag);
assert.sameValue(canonicalLocales.length, 1);
assert.sameValue(canonicalLocales[0], canonicalizedTags[tag]);
assert.sameValue(canonicalLocales[0], regularGrandfatheredUTS35[tag]);
});

View File

@ -13,13 +13,12 @@ includes: [compareArray.js]
var weirdCases =
[
{locale: "x-u-foo", canonical: "und-x-u-foo"},
{locale: "en-x-u-foo"},
{locale: "en-a-bar-x-u-foo"},
{locale: "en-x-u-foo-a-bar"},
{locale: "en-a-bar-u-baz-x-u-foo"},
"en-x-u-foo",
"en-a-bar-x-u-foo",
"en-x-u-foo-a-bar",
"en-a-bar-u-baz-x-u-foo",
];
weirdCases.forEach(function ({locale, canonical = locale}) {
assert(compareArray(Intl.getCanonicalLocales(locale), [canonical]));
weirdCases.forEach(function (weird) {
assert(compareArray(Intl.getCanonicalLocales(weird), [weird]));
});

View File

@ -17,17 +17,28 @@ const tests = [
[undefined, defaultLocale, "undefined"],
["EN", "en", "Single value"],
[[], defaultLocale, "Empty array"],
[["en-GB-oed"], "en-GB", "Grandfathered"],
[["x-private"], defaultLocale, "Private", ["lookup"]],
[["en", "EN"], "en", "Duplicate value (canonical first)"],
[["EN", "en"], "en", "Duplicate value (canonical last)"],
[{ 0: "DE", length: 0 }, defaultLocale, "Object with zero length"],
[{ 0: "DE", length: 1 }, "de", "Object with length"],
];
const errorTests = [
[["en-GB-oed"], "Grandfathered"],
[["x-private"], "Private", ["lookup"]],
];
for (const [locales, expected, name, matchers = ["lookup", "best fit"]] of tests) {
for (const matcher of matchers) {
const rtf = new Intl.ListFormat(locales, {localeMatcher: matcher});
assert.sameValue(rtf.resolvedOptions().locale, expected, name);
}
}
// Each errorTests entry is [locales, name, matchers?]. Constructing an
// Intl.ListFormat with such a locale list is expected to throw a RangeError
// for every locale matcher listed (both matchers when none are given).
for (const [locales, name, matchers = ["lookup", "best fit"]] of errorTests) {
  for (const matcher of matchers) {
    assert.throws(RangeError, function() {
      // Explicit semicolon, matching the parallel RelativeTimeFormat and
      // Segmenter tests instead of relying on automatic semicolon insertion.
      new Intl.ListFormat(locales, {localeMatcher: matcher});
    }, name);
  }
}

View File

@ -17,17 +17,28 @@ const tests = [
[undefined, defaultLocale, "undefined"],
["EN", "en", "Single value"],
[[], defaultLocale, "Empty array"],
[["en-GB-oed"], "en-GB", "Grandfathered"],
[["x-private"], defaultLocale, "Private", ["lookup"]],
[["en", "EN"], "en", "Duplicate value (canonical first)"],
[["EN", "en"], "en", "Duplicate value (canonical last)"],
[{ 0: "DE", length: 0 }, defaultLocale, "Object with zero length"],
[{ 0: "DE", length: 1 }, "de", "Object with length"],
];
const errorTests = [
[["en-GB-oed"], "Grandfathered"],
[["x-private"], "Private", ["lookup"]],
];
for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests) {
for (const matcher of matchers) {
const rtf = new Intl.RelativeTimeFormat(locales, {localeMatcher: matcher});
assert.sameValue(rtf.resolvedOptions().locale, expected, name);
}
}
for (const [locales, name, matchers = ["best fit", "lookup"]] of errorTests) {
for (const matcher of matchers) {
assert.throws(RangeError, function() {
new Intl.RelativeTimeFormat(locales, {localeMatcher: matcher});
}, name);
}
}

View File

@ -21,8 +21,6 @@ const tests = [
[["sr"], ["sr"], "Single-element array"],
[["fr", "ar"], ["fr", "ar"], "Two-element array"],
[["xyz", "ar"], ["ar"], "Two-element array with unknown code"],
[["en-GB-oed"], ["en-GB"], "Grandfathered"],
[["x-private"], [defaultLocale], "Private", ["lookup"]],
[["en", "EN"], ["en"], "Duplicate value (canonical first)"],
[["EN", "en"], ["en"], "Duplicate value (canonical last)"],
[{ 0: "DE", length: 0 }, [defaultLocale], "Object with zero length"],
@ -31,6 +29,11 @@ const tests = [
[{ 1: "ja", 2: "fr" }, [defaultLocale], "Object without length, indexed from 1"],
];
const errorTests = [
[["en-GB-oed"], "Grandfathered"],
[["x-private"], "Private", ["lookup"]],
];
for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests) {
for (const localeMatcher of matchers) {
const segmenter = new Intl.Segmenter(locales, { localeMatcher });
@ -38,3 +41,11 @@ for (const [locales, expected, name, matchers = ["best fit", "lookup"]] of tests
assert(expected.includes(actual), `${name}: expected one of ${expected}, found ${actual}`);
}
}
for (const [locales, name, matchers = ["best fit", "lookup"]] of errorTests) {
for (const localeMatcher of matchers) {
assert.throws(RangeError, function() {
new Intl.Segmenter(locales, { localeMatcher });
}, name);
}
}

View File

@ -21,15 +21,10 @@ var canonicalizedTags = {
// -u-ca is incomplete, so it will not show up in resolvedOptions().locale
"cmn-hans-cn-u-ca-t-ca-x-t-u": ["zh-Hans-CN-t-ca-u-ca-x-t-u", "zh-Hans-CN-t-ca-x-t-u", "zh-Hans-CN-t-ca-x-t", "zh-Hans-CN-t-ca", "zh-Hans-CN", "zh-Hans", "zh"],
"de-gregory-u-ca-gregory": ["de-gregory-u-ca-gregory", "de-gregory", "de-u-ca-gregory", "de"],
"no-nyn": ["nn"],
"i-klingon": ["tlh"],
"sgn-GR": ["gss"],
"ji": ["yi"],
"de-DD": ["de-DE", "de"],
"zh-hak-CN": ["hak-CN", "hak"],
"sgn-ils": ["ils"],
"in": ["id"],
"x-foo": ["x-foo"]
};
// make sure the data above is correct

View File

@ -10,24 +10,7 @@ author: Norbert Lindenberg
includes: [testIntl.js]
---*/
var invalidLanguageTags = [
"", // empty tag
"i", // singleton alone
"x", // private use without subtag
"u", // extension singleton in first place
"419", // region code in first place
"u-nu-latn-cu-bob", // extension sequence without language
"hans-cmn-cn", // "hans" could theoretically be a 4-letter language code,
// but those can't be followed by extlang codes.
"cmn-hans-cn-u-u", // duplicate singleton
"cmn-hans-cn-t-u-ca-u", // duplicate singleton
"de-gregory-gregory", // duplicate variant
"*", // language range
"de-*", // language range
"中文", // non-ASCII letters
"en-ß", // non-ASCII letters
"ıd" // non-ASCII letters
];
var invalidLanguageTags = getInvalidLanguageTags();
testWithIntlConstructors(function (Constructor) {
invalidLanguageTags.forEach(function (tag) {

View File

@ -18,15 +18,9 @@ var validLanguageTags = [
"cmn-hans-cn", // + ISO 3166-1 country code
"es-419", // + UN M.49 region code
"es-419-u-nu-latn-cu-bob", // + Unicode locale extension sequence
"i-klingon", // grandfathered tag
"cmn-hans-cn-t-ca-u-ca-x-t-u", // singleton subtags can also be used as private use subtags
"de-gregory-u-ca-gregory", // variant and extension subtags may be the same
"aa-a-foo-x-a-foo-bar", // variant subtags can also be used as private use subtags
"x-en-US-12345", // anything goes in private use tags
"x-12345-12345-en-US",
"x-en-US-12345-12345",
"x-en-u-foo",
"x-en-u-foo-u-bar"
];
testWithIntlConstructors(function (Constructor) {