Sort variants added to grandfathered tag

And add an explanation for a previously removed and now re-added assertion.
2025-07-24 14:35:30 +02:00 · 2019-07-22 02:12:21 -07:00 · 2019-07-22 02:12:21 -07:00 · 589ef945fa
commit 589ef945fa
parent c596d9674e
1 changed files with 65 additions and 8 deletions
--- a/test/intl402/Locale/likely-subtags-grandfathered.js
+++ b/test/intl402/Locale/likely-subtags-grandfathered.js
@ -108,7 +108,7 @@ for (const tag of regularGrandfatheredWithExtLang) {
    assert.throws(RangeError, () => new Intl.Locale(tag));
 }

-// Add constiants, extensions, and privateuse subtags to regular grandfathered
+// Add variants, extensions, and privateuse subtags to regular grandfathered
 // language tags and ensure it produces the "expected" result.
 const extras = [
    "fonipa",
@ -127,14 +127,71 @@ for (const {tag} of regularGrandfathered) {
    for (const extra of extras) {
        const loc = new Intl.Locale(tag + "-" + extra);

-        assert.sameValue(loc.maximize().toString(), tagMax + "-" + extra);
-        assert.sameValue(loc.maximize().maximize().toString(), tagMax + "-" + extra);
+        let canonical = tag + "-" + extra;
+        let canonicalMax = tagMax + "-" + extra;
+        let canonicalMin = tagMin + "-" + extra;

-        assert.sameValue(loc.minimize().toString(), tagMin + "-" + extra);
-        assert.sameValue(loc.minimize().minimize().toString(), tagMin + "-" + extra);
+        // Ensure the added variant subtag is correctly sorted in the canonical tag.
+        if (/^[a-z0-9]{5,8}|[0-9][a-z0-9]{3}$/i.test(extra)) {
+            const sorted = s => s.replace(/(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+$/i,
+                                          m => m.split("-").sort().join("-"));
+            canonical = sorted(canonical);
+            canonicalMax = sorted(canonicalMax);
+            canonicalMin = sorted(canonicalMin);
+        }

-        assert.sameValue(loc.maximize().minimize().toString(), tagMin + "-" + extra);
-        assert.sameValue(loc.minimize().maximize().toString(), tagMax + "-" + extra);
+        // Adding extra subtags to grandfathered tags can have "interesting" results. Take for
+        // example "art-lojban" when "fonipa" is added, so we get "art-lojban-fonipa". The first
+        // step when canonicalising the language tag is to bring it in 'canonical syntax', that
+        // means among other things sorting variants in alphabetical order. So "art-lojban-fonipa"
+        // is transformed to "art-fonipa-lojban", because "fonipa" is sorted before "lojban". And
+        // only after that has happened, we replace aliases with their preferred form.
+        //
+        // Now the usual problems arise when doing silly things like adding subtags to
+        // grandfathered tags: nobody, neither RFC 5646 nor UTS 35, provides a clear description
+        // of what needs to happen next.
+        //
+        // From <http://unicode.org/reports/tr35/#Language_Tag_to_Locale_Identifier>:
+        //
+        // > If the BCP 47 primary language subtag matches the type attribute of a languageAlias
+        // > element in Supplemental Data, replace the language subtag with the replacement value.
+        // >  1. ...
+        // >  2. Five special deprecated grandfathered codes (such as i-default) are in type
+        // >     attributes, and are also replaced.
+        // >  3. ...
+        //
+        // So let's assume grandfathered tags are treated as 'primary language subtag' if and only
+        // if no additional subtags are present. Because in all other cases, we don't really have a
+        // grandfathered tag, but only some arbitrary combination of random subtags.
+        //
+        // Basically what we expect here is that only grandfathered tags without any additional
+        // subtags are canonicalised to their modern form and in all other cases they're left as is.
+        //
+        // Not all language tag processors will pass this test, for example because they don't order
+        // variant subtags in alphabetical order or they're too eager when detecting grandfathered
+        // tags. For example "zh-hakka-hakka" is accepted in some language tag processors, because
+        // the language tag starts with a prefix which matches a grandfathered tag, and that prefix
+        // is then canonicalised to "hak" and the second "hakka" is simply appended to it, so the
+        // resulting tag is "hak-hakka". This is clearly wrong as far as ECMA-402 compliance is
+        // concerned, because language tags are parsed and validated before any canonicalisation
+        // happens. And during the validation step an error should be emitted, because the input
+        // "zh-hakka-hakka" contains two identical variant subtags.
+        //
+        // From <https://tc39.es/ecma402/#sec-isstructurallyvalidlanguagetag>:
+        //
+        // > does not include duplicate variant subtags
+        //
+        // So, if your implementation fails this assertion, but you'd still like to test the rest of
+        // this file, a pull request to split this file seems the way to go!
+        assert.sameValue(loc.toString(), canonical);
+
+        assert.sameValue(loc.maximize().toString(), canonicalMax);
+        assert.sameValue(loc.maximize().maximize().toString(), canonicalMax);
+
+        assert.sameValue(loc.minimize().toString(), canonicalMin);
+        assert.sameValue(loc.minimize().minimize().toString(), canonicalMin);
+
+        assert.sameValue(loc.maximize().minimize().toString(), canonicalMin);
+        assert.sameValue(loc.minimize().maximize().toString(), canonicalMax);
    }
 }
-