Sort variants added to grandfathered tag

And add an explanation for a previously removed and now re-added assertion.
This commit is contained in:
André Bargull 2019-07-22 02:12:21 -07:00
parent c596d9674e
commit 589ef945fa
1 changed files with 65 additions and 8 deletions

View File

@ -108,7 +108,7 @@ for (const tag of regularGrandfatheredWithExtLang) {
assert.throws(RangeError, () => new Intl.Locale(tag)); assert.throws(RangeError, () => new Intl.Locale(tag));
} }
// Add constiants, extensions, and privateuse subtags to regular grandfathered // Add variants, extensions, and privateuse subtags to regular grandfathered
// language tags and ensure it produces the "expected" result. // language tags and ensure it produces the "expected" result.
const extras = [ const extras = [
"fonipa", "fonipa",
@ -127,14 +127,71 @@ for (const {tag} of regularGrandfathered) {
for (const extra of extras) { for (const extra of extras) {
const loc = new Intl.Locale(tag + "-" + extra); const loc = new Intl.Locale(tag + "-" + extra);
assert.sameValue(loc.maximize().toString(), tagMax + "-" + extra); let canonical = tag + "-" + extra;
assert.sameValue(loc.maximize().maximize().toString(), tagMax + "-" + extra); let canonicalMax = tagMax + "-" + extra;
let canonicalMin = tagMin + "-" + extra;
assert.sameValue(loc.minimize().toString(), tagMin + "-" + extra); // Ensure the added variant subtag is correctly sorted in the canonical tag.
assert.sameValue(loc.minimize().minimize().toString(), tagMin + "-" + extra); if (/^[a-z0-9]{5,8}|[0-9][a-z0-9]{3}$/i.test(extra)) {
const sorted = s => s.replace(/(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+$/i,
assert.sameValue(loc.maximize().minimize().toString(), tagMin + "-" + extra); m => m.split("-").sort().join("-"));
assert.sameValue(loc.minimize().maximize().toString(), tagMax + "-" + extra); canonical = sorted(canonical);
} canonicalMax = sorted(canonicalMax);
canonicalMin = sorted(canonicalMin);
} }
// Adding extra subtags to grandfathered tags can have "interesting" results. Take for
// example "art-lojban" when "fonipa" is added, so we get "art-lojban-fonipa". The first
// step when canonicalising the language tag is to bring it into 'canonical syntax', which
// means, among other things, sorting variants in alphabetical order. So "art-lojban-fonipa"
// is transformed to "art-fonipa-lojban", because "fonipa" is sorted before "lojban". And
// only after that has happened, we replace aliases with their preferred form.
//
// Now the usual problems arise when doing silly things like adding subtags to
// grandfathered tags: nobody, neither RFC 5646 nor UTS 35, provides a clear description
// of what needs to happen next.
//
// From <http://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier>:
//
// > If the BCP 47 primary language subtag matches the type attribute of a languageAlias
// > element in Supplemental Data, replace the language subtag with the replacement value.
// > 1. ...
// > 2. Five special deprecated grandfathered codes (such as i-default) are in type
// >    attributes, and are also replaced.
// > 3. ...
//
// So let's assume grandfathered tags are treated as 'primary language subtag' if and only
// if no additional subtags are present. Because in all other cases, we don't really have a
// grandfathered tag, but only some arbitrary combination of random subtags.
//
// Basically what we expect here is that only grandfathered tags without any additional
// subtags are canonicalised to their modern form and in all other cases they're left as is.
//
// Not all language tag processors will pass this test, for example because they don't order
// variant subtags in alphabetical order or they're too eager when detecting grandfathered
// tags. For example "zh-hakka-hakka" is accepted in some language tag processors, because
// the language tag starts with a prefix which matches a grandfathered tag, and that prefix
// is then canonicalised to "hak" and the second "hakka" is simply appended to it, so the
// resulting tag is "hak-hakka". This is clearly wrong as far as ECMA-402 compliance is
// concerned, because language tags are parsed and validated before any canonicalisation
// happens. And during the validation step an error should be emitted, because the input
// "zh-hakka-hakka" contains two identical variant subtags.
//
// From <https://tc39.es/ecma402/#sec-isstructurallyvalidlanguagetag>:
//
// > does not include duplicate variant subtags
//
// So, if your implementation fails this assertion, but you'd still like to test the rest of
// this file, a pull request to split this file seems the way to go!
assert.sameValue(loc.toString(), canonical);
assert.sameValue(loc.maximize().toString(), canonicalMax);
assert.sameValue(loc.maximize().maximize().toString(), canonicalMax);
assert.sameValue(loc.minimize().toString(), canonicalMin);
assert.sameValue(loc.minimize().minimize().toString(), canonicalMin);
assert.sameValue(loc.maximize().minimize().toString(), canonicalMin);
assert.sameValue(loc.minimize().maximize().toString(), canonicalMax);
}
}