Add the language tag mappings Python script

The file is derived from the same-named SpiderMonkey file, so I've kept the
MPL license info.

Subsequent commits use this script to generate the language tag mapping data.
André Bargull 2020-03-30 04:55:16 -07:00 committed by Rick Waldron
parent 756ee6a171
commit 033f31a8ed
1 changed file with 645 additions and 0 deletions

tools/misc/make_intl_data.py Executable file

@@ -0,0 +1,645 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Mozilla Corporation. All rights reserved.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Original file:
# https://hg.mozilla.org/mozilla-central/file/tip/js/src/builtin/intl/make_intl_data.py
""" Usage:
make_intl_data.py langtags [cldr_core.zip]
Target "langtags":
This script extracts information about 1) mappings between deprecated and
current Unicode BCP 47 locale identifiers, and 2) deprecated and current
BCP 47 Unicode extension values from CLDR.
"""
from __future__ import print_function
import os
import re
import io
import sys
from contextlib import closing
from functools import partial
from operator import itemgetter
from zipfile import ZipFile
if sys.version_info.major == 2:
from urllib2 import urlopen
else:
from urllib.request import urlopen
def read_supplemental_data(core_file):
""" Reads CLDR Supplemental Data and extracts information for Intl.js.
Information extracted:
- grandfatheredMappings: mappings from grandfathered tags to preferred
complete language tags
- languageMappings: mappings from language subtags to preferred subtags
- complexLanguageMappings: mappings from language subtags with complex rules
- regionMappings: mappings from region subtags to preferred subtags
- complexRegionMappings: mappings from region subtags with complex rules
- variantMappings: mappings from variant subtags to preferred subtags
Returns these mappings as dictionaries.
"""
import xml.etree.ElementTree as ET
# From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
re_unicode_language_id = re.compile(
r"""
^
# unicode_language_id = unicode_language_subtag
# unicode_language_subtag = alpha{2,3} | alpha{5,8}
(?P<language>[a-z]{2,3}|[a-z]{5,8})
# (sep unicode_script_subtag)?
# unicode_script_subtag = alpha{4}
(?:-(?P<script>[a-z]{4}))?
# (sep unicode_region_subtag)?
# unicode_region_subtag = (alpha{2} | digit{3})
(?:-(?P<region>([a-z]{2}|[0-9]{3})))?
# (sep unicode_variant_subtag)*
# unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
(?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
$
""", re.IGNORECASE | re.VERBOSE)
re_unicode_language_subtag = re.compile(
r"""
^
# unicode_language_subtag = alpha{2,3} | alpha{5,8}
([a-z]{2,3}|[a-z]{5,8})
$
""", re.IGNORECASE | re.VERBOSE)
re_unicode_region_subtag = re.compile(
r"""
^
# unicode_region_subtag = (alpha{2} | digit{3})
([a-z]{2}|[0-9]{3})
$
""", re.IGNORECASE | re.VERBOSE)
re_unicode_variant_subtag = re.compile(
r"""
^
# unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3}))
$
""", re.IGNORECASE | re.VERBOSE)
# The fixed list of BCP 47 grandfathered language tags.
grandfathered_tags = (
"art-lojban",
"cel-gaulish",
"en-GB-oed",
"i-ami",
"i-bnn",
"i-default",
"i-enochian",
"i-hak",
"i-klingon",
"i-lux",
"i-mingo",
"i-navajo",
"i-pwn",
"i-tao",
"i-tay",
"i-tsu",
"no-bok",
"no-nyn",
"sgn-BE-FR",
"sgn-BE-NL",
"sgn-CH-DE",
"zh-guoyu",
"zh-hakka",
"zh-min",
"zh-min-nan",
"zh-xiang",
)
# The list of grandfathered tags which are valid Unicode BCP 47 locale identifiers.
unicode_bcp47_grandfathered_tags = {tag for tag in grandfathered_tags
if re_unicode_language_id.match(tag)}
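# For example, "art-lojban" matches re_unicode_language_id (language "art"
# plus variant "lojban") and is kept, whereas "i-ami" is dropped because
# "i" is not a valid unicode_language_subtag.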
# Dictionary of simple language subtag mappings, e.g. "in" -> "id".
language_mappings = {}
# Dictionary of complex language subtag mappings, modifying more than one
# subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
complex_language_mappings = {}
# Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
region_mappings = {}
# Dictionary of complex region subtag mappings, containing more than one
# replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
complex_region_mappings = {}
# Dictionary of aliased variant subtags to a tuple of preferred replacement
# type and replacement, e.g. "arevela" -> ("language", "hy") or
# "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
variant_mappings = {}
# Dictionary of grandfathered mappings to preferred values.
grandfathered_mappings = {}
# CLDR uses "_" as the separator for some elements. Replace it with "-".
def bcp47_id(cldr_id):
return cldr_id.replace("_", "-")
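# e.g. bcp47_id("sgn_BE_FR") returns "sgn-BE-FR".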
# CLDR uses the canonical case for most entries, but there are some
# exceptions, like:
# <languageAlias type="drw" replacement="fa_af" reason="deprecated"/>
# Therefore canonicalize all tags to be on the safe side.
def bcp47_canonical(language, script, region):
# Canonical case for language subtags is lower case.
# Canonical case for script subtags is title case.
# Canonical case for region subtags is upper case.
return (language.lower() if language else None,
script.title() if script else None,
region.upper() if region else None)
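# e.g. for the "drw" alias above, bcp47_canonical("fa", None, "af")
# returns ("fa", None, "AF").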
tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))
for language_alias in tree.iterfind(".//languageAlias"):
type = bcp47_id(language_alias.get("type"))
replacement = bcp47_id(language_alias.get("replacement"))
# Handle grandfathered mappings first.
if type in unicode_bcp47_grandfathered_tags:
grandfathered_mappings[type] = replacement
continue
# We're only interested in language subtag matches, so ignore any
# entries which have additional subtags.
if re_unicode_language_subtag.match(type) is None:
continue
assert type.islower()
if re_unicode_language_subtag.match(replacement) is not None:
# Canonical case for language subtags is lower-case.
language_mappings[type] = replacement.lower()
else:
replacement_match = re_unicode_language_id.match(replacement)
assert replacement_match is not None, (
"{} invalid Unicode BCP 47 locale identifier".format(replacement))
assert replacement_match.group("variants") is None, (
"{}: unexpected variant subtags in {}".format(type, replacement))
complex_language_mappings[type] = bcp47_canonical(replacement_match.group("language"),
replacement_match.group("script"),
replacement_match.group("region"))
for territory_alias in tree.iterfind(".//territoryAlias"):
type = territory_alias.get("type")
replacement = territory_alias.get("replacement")
# We're only interested in region subtag matches, so ignore any entries
# which contain legacy formats, e.g. three-letter region codes.
if re_unicode_region_subtag.match(type) is None:
continue
assert type.isupper() or type.isdigit()
if re_unicode_region_subtag.match(replacement) is not None:
# Canonical case for region subtags is upper-case.
region_mappings[type] = replacement.upper()
else:
# Canonical case for region subtags is upper-case.
replacements = [r.upper() for r in replacement.split(" ")]
assert all(
re_unicode_region_subtag.match(loc) is not None for loc in replacements
), "{} invalid region subtags".format(replacement)
complex_region_mappings[type] = replacements
for variant_alias in tree.iterfind(".//variantAlias"):
type = variant_alias.get("type")
replacement = variant_alias.get("replacement")
assert re_unicode_variant_subtag.match(type) is not None, (
"{} invalid variant subtag".format(type))
# Normalize the case, because some variants are in upper case.
type = type.lower()
# The replacement can be a language, a region, or a variant subtag.
# Language and region subtags are case normalized, variant subtags can
# be in any case.
if re_unicode_language_subtag.match(replacement) is not None and replacement.islower():
variant_mappings[type] = ("language", replacement)
elif re_unicode_region_subtag.match(replacement) is not None:
assert replacement.isupper() or replacement.isdigit(), (
"{} invalid variant subtag replacement".format(replacement))
variant_mappings[type] = ("region", replacement)
else:
assert re_unicode_variant_subtag.match(replacement) is not None, (
"{} invalid variant subtag replacement".format(replacement))
variant_mappings[type] = ("variant", replacement.lower())
tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))
likely_subtags = {}
for likely_subtag in tree.iterfind(".//likelySubtag"):
from_tag = bcp47_id(likely_subtag.get("from"))
from_match = re_unicode_language_id.match(from_tag)
assert from_match is not None, (
"{} invalid Unicode BCP 47 locale identifier".format(from_tag))
assert from_match.group("variants") is None, (
"unexpected variant subtags in {}".format(from_tag))
to_tag = bcp47_id(likely_subtag.get("to"))
to_match = re_unicode_language_id.match(to_tag)
assert to_match is not None, (
"{} invalid Unicode BCP 47 locale identifier".format(to_tag))
assert to_match.group("variants") is None, (
"unexpected variant subtags in {}".format(to_tag))
from_canonical = bcp47_canonical(from_match.group("language"),
from_match.group("script"),
from_match.group("region"))
to_canonical = bcp47_canonical(to_match.group("language"),
to_match.group("script"),
to_match.group("region"))
likely_subtags[from_canonical] = to_canonical
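# Illustration of the resolution below, using CLDR's "SU" (Soviet Union)
# alias: its replacement list starts with "RU", so "RU" is the default.
# The likely-subtags entry "hy" -> "hy-Armn-AM" targets "AM", another
# replacement region, so ("hy", None, "AM") ends up as a non-default
# replacement and "SU" remains a complex mapping.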
complex_region_mappings_final = {}
for (deprecated_region, replacements) in complex_region_mappings.items():
# Find all likely subtag entries which don't already contain a region
# subtag and whose target region is in the list of replacement regions.
region_likely_subtags = [(from_language, from_script, to_region)
for ((from_language, from_script, from_region),
(_, _, to_region)) in likely_subtags.items()
if from_region is None and to_region in replacements]
# The first replacement entry is the default region.
default = replacements[0]
# Find all likely subtag entries whose region matches the default region.
default_replacements = {(language, script)
for (language, script, region) in region_likely_subtags
if region == default}
# And finally find those entries which don't use the default region.
# These are the entries we're actually interested in, because those need
# to be handled specially when selecting the correct preferred region.
non_default_replacements = [(language, script, region)
for (language, script, region) in region_likely_subtags
if (language, script) not in default_replacements]
# If there are no non-default replacements, we can handle the region as
# part of the simple region mapping.
if non_default_replacements:
complex_region_mappings_final[deprecated_region] = (default, non_default_replacements)
else:
region_mappings[deprecated_region] = default
return {"grandfatheredMappings": grandfathered_mappings,
"languageMappings": language_mappings,
"complexLanguageMappings": complex_language_mappings,
"regionMappings": region_mappings,
"complexRegionMappings": complex_region_mappings_final,
"variantMappings": variant_mappings,
}
def read_unicode_extensions(core_file):
import xml.etree.ElementTree as ET
# Match all xml-files in the BCP 47 directory.
bcp_file_re = re.compile(r"^common/bcp47/.+\.xml$")
# https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
#
# type = alphanum{3,8} (sep alphanum{3,8})* ;
type_re = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
# Mapping from Unicode extension types to dict of deprecated to
# preferred values.
mapping = {
# Unicode BCP 47 U Extension
"u": {},
# Unicode BCP 47 T Extension
"t": {},
}
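# For example, after processing the CLDR calendar data, this is expected to
# contain an entry like mapping["u"]["ca"]["islamicc"] == "islamic-civil".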
def read_bcp47_file(file):
tree = ET.parse(file)
for keyword in tree.iterfind(".//keyword/key"):
extension = keyword.get("extension", "u")
assert extension == "u" or extension == "t", (
"unknown extension type: {}".format(extension))
extension_name = keyword.get("name")
for type in keyword.iterfind("type"):
# <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
#
# The key or type name used by Unicode locale extension with 'u' extension
# syntax or the 't' extensions syntax. When alias below is absent, this name
# can be also used with the old style "@key=type" syntax.
name = type.get("name")
# Ignore the special names:
# - <https://unicode.org/reports/tr35/#CODEPOINTS>
# - <https://unicode.org/reports/tr35/#REORDER_CODE>
# - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
# - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
# - <https://unicode.org/reports/tr35/#PRIVATE_USE>
if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE",
"PRIVATE_USE"):
continue
# All other names should match the 'type' production.
assert type_re.match(name) is not None, (
"{} matches the 'type' production".format(name))
# <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
#
# The preferred value of the deprecated key, type or attribute element.
# When a key, type or attribute element is deprecated, this attribute is
# used for specifying a new canonical form if available.
preferred = type.get("preferred")
# <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
#
# The BCP 47 form is the canonical form, and recommended. Other aliases are
# included only for backwards compatibility.
alias = type.get("alias")
# <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
#
# Use the bcp47 data to replace keys, types, tfields, and tvalues by their
# canonical forms. See Section 3.6.4 (U Extension Data Files) and Section
# 3.7.1 (T Extension Data Files). The aliases are in the alias attribute
# value, while the canonical is in the name attribute value.
# 'preferred' contains the new preferred name, 'alias' the compatibility
# name, but then there's this entry where 'preferred' and 'alias' are the
# same. So which one to choose? Assume 'preferred' is the actual canonical
# name.
#
# <type name="islamicc"
# description="Civil (algorithmic) Arabic calendar"
# deprecated="true"
# preferred="islamic-civil"
# alias="islamic-civil"/>
if preferred is not None:
assert type_re.match(preferred), preferred
mapping[extension].setdefault(extension_name, {})[name] = preferred
if alias is not None:
for alias_name in alias.lower().split(" "):
# Ignore alias entries which don't match the 'type' production.
if type_re.match(alias_name) is None:
continue
# See comment above when 'alias' and 'preferred' are both present.
if (preferred is not None and
name in mapping[extension][extension_name]):
continue
# Skip over entries where 'name' and 'alias' are equal.
#
# <type name="pst8pdt"
# description="POSIX style time zone for US Pacific Time"
# alias="PST8PDT"
# since="1.8"/>
if name == alias_name:
continue
mapping[extension].setdefault(extension_name, {})[alias_name] = name
def read_supplemental_metadata(file):
# Find subdivision and region replacements.
#
# <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
#
# Replace aliases in special key values:
# - If there is an 'sd' or 'rg' key, replace any subdivision alias
# in its value in the same way, using subdivisionAlias data.
tree = ET.parse(file)
for alias in tree.iterfind(".//subdivisionAlias"):
type = alias.get("type")
assert type_re.match(type) is not None, (
"{} matches the 'type' production".format(type))
# Take the first replacement when multiple ones are present.
replacement = alias.get("replacement").split(" ")[0].lower()
# Skip over invalid replacements.
#
# <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/>
#
# It's not entirely clear to me if CLDR actually wants to use
# "axzzzz" as the replacement for this case.
if type_re.match(replacement) is None:
continue
# 'subdivisionAlias' applies to 'rg' and 'sd' keys.
mapping["u"].setdefault("rg", {})[type] = replacement
mapping["u"].setdefault("sd", {})[type] = replacement
for name in core_file.namelist():
if bcp_file_re.match(name):
read_bcp47_file(core_file.open(name))
read_supplemental_metadata(core_file.open("common/supplemental/supplementalMetadata.xml"))
return {
"unicodeMappings": mapping["u"],
"transformMappings": mapping["t"],
}
def write_simple_mappings(println, name, mappings):
println(u"var {} = {{".format(name))
for (key, value) in sorted(mappings.items(), key=itemgetter(0)):
println(u""" "{}": "{}",""".format(key, value))
println(u"};")
def write_complex_language_mappings(println, mappings):
println(u"var __complexLanguageMappings = {")
def maybe_subtag(name, subtag):
if subtag is None:
return u""
return u""", {}: "{}\"""".format(name, subtag)
for (deprecated_language, (language, script, region)) in (
sorted(mappings.items(), key=itemgetter(0))
):
println(u""" "{}": {{language: "{}"{}{}}},""".format(deprecated_language, language,
maybe_subtag("script", script),
maybe_subtag("region", region)))
println(u"};")
def write_complex_region_mappings(println, mappings):
println(u"var __complexRegionMappings = {")
def maybe_subtag(name, subtag):
if subtag is None:
return u""
return u"""{}: "{}", """.format(name, subtag)
for (deprecated_region, (default, non_default_replacements)) in (
sorted(mappings.items(), key=itemgetter(0))
):
println(u""" "{}": {{""".format(deprecated_region))
println(u""" default: "{}",""".format(default))
for (language, script, region) in sorted(non_default_replacements, key=itemgetter(0, 1)):
mapping_key = language
if script is not None:
mapping_key += "-" + script
println(u""" "{}": "{}",""".format(mapping_key, region))
println(u" },")
println(u"};")
def write_variant_mappings(println, mappings):
println(u"var __variantMappings = {")
for (deprecated_variant, (type, replacement)) in sorted(mappings.items(), key=itemgetter(0)):
println(u""" "{}": {{type: "{}", replacement: "{}"}},""".format(deprecated_variant, type,
replacement))
println(u"};")
def write_unicode_extension_mappings(println, mapping, extension):
println(u"var __{}Mappings = {{".format(extension))
for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
println(u""" "{}": {{""".format(key))
for (type, replacement) in sorted(replacements.items(), key=itemgetter(0)):
println(u""" "{}": "{}",""".format(type, replacement))
println(u" },")
println(u"};")
def write_cldr_language_tag_data(println, data, url):
language_mappings = data["languageMappings"]
complex_language_mappings = data["complexLanguageMappings"]
region_mappings = data["regionMappings"]
complex_region_mappings = data["complexRegionMappings"]
variant_mappings = data["variantMappings"]
unicode_mappings = data["unicodeMappings"]
transform_mappings = data["transformMappings"]
write_simple_mappings(println, "__languageMappings", language_mappings)
write_simple_mappings(println, "__regionMappings", region_mappings)
write_complex_language_mappings(println, complex_language_mappings)
write_complex_region_mappings(println, complex_region_mappings)
write_variant_mappings(println, variant_mappings)
write_unicode_extension_mappings(println, unicode_mappings, "unicode")
write_unicode_extension_mappings(println, transform_mappings, "transform")
def update_cldr_lang_tags(args):
""" Generate the language tag mapping objects. """
version = args.version
url = args.url
out = args.out
filename = args.file
url = url.replace("<VERSION>", version)
print("Arguments:")
print("\tCLDR version: %s" % version)
print("\tDownload url: %s" % url)
if filename is not None:
print("\tLocal CLDR core.zip file: %s" % filename)
print("\tOutput file: %s" % out)
print("")
data = {
"version": version,
}
def read_files(cldr_file):
with ZipFile(cldr_file) as zip_file:
data.update(read_supplemental_data(zip_file))
data.update(read_unicode_extensions(zip_file))
print("Processing CLDR data...")
if filename is not None:
print("Always make sure you have the newest CLDR core.zip!")
with open(filename, "rb") as cldr_file:
read_files(cldr_file)
else:
print("Downloading CLDR core.zip...")
with closing(urlopen(url)) as cldr_file:
cldr_data = io.BytesIO(cldr_file.read())
read_files(cldr_data)
print("Writing Intl data...")
if out == "stdout":
out = sys.stdout.fileno()
with io.open(out, mode="w", encoding="utf-8", newline="") as f:
println = partial(print, file=f)
write_cldr_language_tag_data(println, data, url)
if __name__ == "__main__":
import argparse
def ensure_https(v):
if not v.startswith("https:"):
raise argparse.ArgumentTypeError("URL protocol must be https: %s" % v)
return v
parser = argparse.ArgumentParser(description="Update CLDR language tags data.")
parser.add_argument("--version",
metavar="VERSION",
required=True,
help="CLDR version number")
parser.add_argument("--url",
metavar="URL",
default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
type=ensure_https,
help="Download url CLDR data (default: %(default)s)")
parser.add_argument("--out",
default="stdout",
help="Output file (default: %(default)s)")
parser.add_argument("file",
nargs="?",
help="Local cldr-core.zip file, if omitted uses <URL>")
parser.set_defaults(func=update_cldr_lang_tags)
args = parser.parse_args()
args.func(args)