test262/tools/misc/make_intl_data.py

646 lines
25 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Mozilla Corporation. All rights reserved.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Original file:
# https://hg.mozilla.org/mozilla-central/file/tip/js/src/builtin/intl/make_intl_data.py
""" Usage:
make_intl_data.py langtags [cldr_core.zip]
Target "langtags":
This script extracts information about 1) mappings between deprecated and
current Unicode BCP 47 locale identifiers, and 2) deprecated and current
BCP 47 Unicode extension values from CLDR.
"""
from __future__ import print_function
import os
import re
import io
import sys
from contextlib import closing
from functools import partial
from operator import itemgetter
from zipfile import ZipFile
if sys.version_info.major == 2:
from urllib2 import urlopen
else:
from urllib.request import urlopen
def read_supplemental_data(core_file):
    """Read the CLDR supplemental data and build the alias tables for Intl.js.

    ``core_file`` has to provide ``open(name)`` for the members of the CLDR
    core archive (a ``zipfile.ZipFile`` works). Two members are read:
    ``common/supplemental/supplementalMetadata.xml`` and
    ``common/supplemental/likelySubtags.xml``.

    The returned dictionary has the following entries:
    - grandfatheredMappings: grandfathered tag -> preferred language tag
    - languageMappings: language subtag -> preferred language subtag
    - complexLanguageMappings: language subtag -> (language, script, region),
      where script and region may be None
    - regionMappings: region subtag -> preferred region subtag
    - complexRegionMappings: region subtag -> (default region,
      [(language, script, region), ...])
    - variantMappings: variant subtag -> (replacement kind, replacement),
      where the kind is "language", "region", or "variant"
    """
    import xml.etree.ElementTree as ET

    regex_flags = re.IGNORECASE | re.VERBOSE

    # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
    re_unicode_language_id = re.compile(
        r"""
        ^
        # unicode_language_id = unicode_language_subtag
        # unicode_language_subtag = alpha{2,3} | alpha{5,8}
        (?P<language>[a-z]{2,3}|[a-z]{5,8})
        # (sep unicode_script_subtag)?
        # unicode_script_subtag = alpha{4}
        (?:-(?P<script>[a-z]{4}))?
        # (sep unicode_region_subtag)?
        # unicode_region_subtag = (alpha{2} | digit{3})
        (?:-(?P<region>([a-z]{2}|[0-9]{3})))?
        # (sep unicode_variant_subtag)*
        # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
        (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
        $
        """, regex_flags)

    re_unicode_language_subtag = re.compile(
        r"""
        ^
        # unicode_language_subtag = alpha{2,3} | alpha{5,8}
        ([a-z]{2,3}|[a-z]{5,8})
        $
        """, regex_flags)

    re_unicode_region_subtag = re.compile(
        r"""
        ^
        # unicode_region_subtag = (alpha{2} | digit{3})
        ([a-z]{2}|[0-9]{3})
        $
        """, regex_flags)

    re_unicode_variant_subtag = re.compile(
        r"""
        ^
        # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
        ([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3}))
        $
        """, regex_flags)

    # The fixed list of BCP 47 grandfathered language tags.
    grandfathered_tags = (
        "art-lojban",
        "cel-gaulish",
        "en-GB-oed",
        "i-ami",
        "i-bnn",
        "i-default",
        "i-enochian",
        "i-hak",
        "i-klingon",
        "i-lux",
        "i-mingo",
        "i-navajo",
        "i-pwn",
        "i-tao",
        "i-tay",
        "i-tsu",
        "no-bok",
        "no-nyn",
        "sgn-BE-FR",
        "sgn-BE-NL",
        "sgn-CH-DE",
        "zh-guoyu",
        "zh-hakka",
        "zh-min",
        "zh-min-nan",
        "zh-xiang",
    )

    # Only the grandfathered tags which are also syntactically valid Unicode
    # BCP 47 locale identifiers are of interest.
    unicode_bcp47_grandfathered_tags = set(
        filter(re_unicode_language_id.match, grandfathered_tags))

    # Simple language subtag mappings, e.g. "in" -> "id".
    language_mappings = {}

    # Language subtag mappings which modify more than one subtag,
    # e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
    complex_language_mappings = {}

    # Simple region subtag mappings, e.g. "DD" -> "DE".
    region_mappings = {}

    # Region subtags with more than one replacement candidate,
    # e.g. "SU" -> ["RU", "AM", "AZ", "BY", ...]. Resolved further below.
    region_candidates = {}

    # Aliased variant subtags mapped to a (kind, replacement) tuple,
    # e.g. "arevela" -> ("language", "hy"), "aaland" -> ("region", "AX"),
    # or "heploc" -> ("variant", "alalc97").
    variant_mappings = {}

    # Grandfathered tags mapped to their preferred values.
    grandfathered_mappings = {}

    def bcp47_id(cldr_id):
        # CLDR separates some subtags with "_"; BCP 47 uses "-".
        return cldr_id.replace("_", "-")

    def canonical_subtags(match):
        # CLDR mostly uses the canonical case, but not always, for example:
        #   <languageAlias type="drw" replacement="fa_af" reason="deprecated"/>
        # Therefore canonicalize every tag: lower case for language subtags,
        # title case for script subtags, upper case for region subtags.
        language, script, region = match.group("language", "script", "region")
        return (language.lower() if language else None,
                script.title() if script else None,
                region.upper() if region else None)

    metadata = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))

    for element in metadata.iterfind(".//languageAlias"):
        alias_type = bcp47_id(element.get("type"))
        replacement = bcp47_id(element.get("replacement"))

        # Grandfathered tags take precedence over everything else.
        if alias_type in unicode_bcp47_grandfathered_tags:
            grandfathered_mappings[alias_type] = replacement
            continue

        # Entries whose type has additional subtags are not language subtag
        # aliases and are ignored.
        if not re_unicode_language_subtag.match(alias_type):
            continue

        assert alias_type.islower()

        if re_unicode_language_subtag.match(replacement):
            # Canonical case for language subtags is lower-case.
            language_mappings[alias_type] = replacement.lower()
            continue

        replacement_match = re_unicode_language_id.match(replacement)
        assert replacement_match is not None, (
            "{} invalid Unicode BCP 47 locale identifier".format(replacement))
        assert replacement_match.group("variants") is None, (
            "{}: unexpected variant subtags in {}".format(alias_type, replacement))

        complex_language_mappings[alias_type] = canonical_subtags(replacement_match)

    for element in metadata.iterfind(".//territoryAlias"):
        alias_type = element.get("type")
        replacement = element.get("replacement")

        # Legacy formats, e.g. three letter region codes, are ignored.
        if not re_unicode_region_subtag.match(alias_type):
            continue

        assert alias_type.isupper() or alias_type.isdigit()

        # Canonical case for region subtags is upper-case.
        if re_unicode_region_subtag.match(replacement):
            region_mappings[alias_type] = replacement.upper()
            continue

        candidates = [code.upper() for code in replacement.split(" ")]
        assert all(map(re_unicode_region_subtag.match, candidates)), (
            "{} invalid region subtags".format(replacement))
        region_candidates[alias_type] = candidates

    for element in metadata.iterfind(".//variantAlias"):
        alias_type = element.get("type")
        replacement = element.get("replacement")

        assert re_unicode_variant_subtag.match(alias_type) is not None, (
            "{} invalid variant subtag".format(alias_type))

        # Some variants are listed in upper case.
        alias_type = alias_type.lower()

        # The replacement is a language, a region, or a variant subtag.
        # Language and region subtags are expected in canonical case, variant
        # subtags can be in any case.
        if re_unicode_language_subtag.match(replacement) is not None and replacement.islower():
            preferred = ("language", replacement)
        elif re_unicode_region_subtag.match(replacement) is not None:
            assert replacement.isupper() or replacement.isdigit(), (
                "{} invalid variant subtag replacement".format(replacement))
            preferred = ("region", replacement)
        else:
            assert re_unicode_variant_subtag.match(replacement) is not None, (
                "{} invalid variant subtag replacement".format(replacement))
            preferred = ("variant", replacement.lower())
        variant_mappings[alias_type] = preferred

    def parse_likely_tag(cldr_tag):
        # Validate one side of a likelySubtag entry and canonicalize it.
        tag = bcp47_id(cldr_tag)
        match = re_unicode_language_id.match(tag)
        assert match is not None, (
            "{} invalid Unicode BCP 47 locale identifier".format(tag))
        assert match.group("variants") is None, (
            "unexpected variant subtags in {}".format(tag))
        return canonical_subtags(match)

    likely = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))

    likely_subtags = {}
    for element in likely.iterfind(".//likelySubtag"):
        # Two statements on purpose: the "from" side is validated first.
        from_canonical = parse_likely_tag(element.get("from"))
        to_canonical = parse_likely_tag(element.get("to"))
        likely_subtags[from_canonical] = to_canonical

    complex_region_mappings_final = {}

    for deprecated_region, replacements in region_candidates.items():
        # The first replacement entry is the default region.
        default = replacements[0]

        # Collect the likely subtag entries which don't already contain a
        # region subtag and whose target region is one of the replacements.
        matching = []
        for (from_language, from_script, from_region), to_tag in likely_subtags.items():
            to_region = to_tag[2]
            if from_region is None and to_region in replacements:
                matching.append((from_language, from_script, to_region))

        # (language, script) pairs which resolve to the default region.
        resolves_to_default = set()
        for (language, script, region) in matching:
            if region == default:
                resolves_to_default.add((language, script))

        # The remaining entries need special handling when selecting the
        # preferred region, so they are the ones recorded.
        non_default_replacements = [entry for entry in matching
                                    if entry[:2] not in resolves_to_default]

        if non_default_replacements:
            complex_region_mappings_final[deprecated_region] = (
                default, non_default_replacements)
        else:
            # Without exceptions this is just a simple region mapping.
            region_mappings[deprecated_region] = default

    return {
        "grandfatheredMappings": grandfathered_mappings,
        "languageMappings": language_mappings,
        "complexLanguageMappings": complex_language_mappings,
        "regionMappings": region_mappings,
        "complexRegionMappings": complex_region_mappings_final,
        "variantMappings": variant_mappings,
    }
def read_unicode_extensions(core_file):
import xml.etree.ElementTree as ET
# Match all xml-files in the BCP 47 directory.
bcp_file_re = re.compile(r"^common/bcp47/.+\.xml$")
# https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
#
# type = alphanum{3,8} (sep alphanum{3,8})* ;
type_re = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
# Mapping from Unicode extension types to dict of deprecated to
# preferred values.
mapping = {
# Unicode BCP 47 U Extension
"u": {},
# Unicode BCP 47 T Extension
"t": {},
}
def read_bcp47_file(file):
tree = ET.parse(file)
for keyword in tree.iterfind(".//keyword/key"):
extension = keyword.get("extension", "u")
assert extension == "u" or extension == "t", (
"unknown extension type: {}".format(extension))
extension_name = keyword.get("name")
for type in keyword.iterfind("type"):
# <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
#
# The key or type name used by Unicode locale extension with 'u' extension
# syntax or the 't' extensions syntax. When alias below is absent, this name
# can be also used with the old style "@key=type" syntax.
name = type.get("name")
# Ignore the special name:
# - <https://unicode.org/reports/tr35/#CODEPOINTS>
# - <https://unicode.org/reports/tr35/#REORDER_CODE>
# - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
# - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
# - <https://unicode.org/reports/tr35/#PRIVATE_USE>
if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE",
"PRIVATE_USE"):
continue
# All other names should match the 'type' production.
assert type_re.match(name) is not None, (
"{} matches the 'type' production".format(name))
# <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
#
# The preferred value of the deprecated key, type or attribute element.
# When a key, type or attribute element is deprecated, this attribute is
# used for specifying a new canonical form if available.
preferred = type.get("preferred")
# <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
#
# The BCP 47 form is the canonical form, and recommended. Other aliases are
# included only for backwards compatibility.
alias = type.get("alias")
# <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
#
# Use the bcp47 data to replace keys, types, tfields, and tvalues by their
# canonical forms. See Section 3.6.4 U Extension Data Files) and Section
# 3.7.1 T Extension Data Files. The aliases are in the alias attribute
# value, while the canonical is in the name attribute value.
# 'preferred' contains the new preferred name, 'alias' the compatibility
# name, but then there's this entry where 'preferred' and 'alias' are the
# same. So which one to choose? Assume 'preferred' is the actual canonical
# name.
#
# <type name="islamicc"
# description="Civil (algorithmic) Arabic calendar"
# deprecated="true"
# preferred="islamic-civil"
# alias="islamic-civil"/>
if preferred is not None:
assert type_re.match(preferred), preferred
mapping[extension].setdefault(extension_name, {})[name] = preferred
if alias is not None:
for alias_name in alias.lower().split(" "):
# Ignore alias entries which don't match the 'type' production.
if type_re.match(alias_name) is None:
continue
# See comment above when 'alias' and 'preferred' are both present.
if (preferred is not None and
name in mapping[extension][extension_name]):
continue
# Skip over entries where 'name' and 'alias' are equal.
#
# <type name="pst8pdt"
# description="POSIX style time zone for US Pacific Time"
# alias="PST8PDT"
# since="1.8"/>
if name == alias_name:
continue
mapping[extension].setdefault(extension_name, {})[alias_name] = name
def read_supplemental_metadata(file):
# Find subdivision and region replacements.
#
# <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
#
# Replace aliases in special key values:
# - If there is an 'sd' or 'rg' key, replace any subdivision alias
# in its value in the same way, using subdivisionAlias data.
tree = ET.parse(file)
for alias in tree.iterfind(".//subdivisionAlias"):
type = alias.get("type")
assert type_re.match(type) is not None, (
"{} matches the 'type' production".format(type))
# Take the first replacement when multiple ones are present.
replacement = alias.get("replacement").split(" ")[0].lower()
# Skip over invalid replacements.
#
# <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/>
#
# It's not entirely clear to me if CLDR actually wants to use
# "axzzzz" as the replacement for this case.
if type_re.match(replacement) is None:
continue
# 'subdivisionAlias' applies to 'rg' and 'sd' keys.
mapping["u"].setdefault("rg", {})[type] = replacement
mapping["u"].setdefault("sd", {})[type] = replacement
for name in core_file.namelist():
if bcp_file_re.match(name):
read_bcp47_file(core_file.open(name))
read_supplemental_metadata(core_file.open("common/supplemental/supplementalMetadata.xml"))
return {
"unicodeMappings": mapping["u"],
"transformMappings": mapping["t"],
}
def write_simple_mappings(println, name, mappings):
    """Emit ``mappings`` as a JavaScript object literal assigned to ``name``.

    ``println`` is called once per output line. The entries are written in
    ascending key order, one ``"key": "value",`` line each.
    """
    println(u"var {} = {{".format(name))

    for key in sorted(mappings):
        println(u""" "{}": "{}",""".format(key, mappings[key]))

    println(u"};")
def write_complex_language_mappings(println, mappings):
    """Emit the ``__complexLanguageMappings`` JavaScript object.

    ``mappings`` maps a deprecated language subtag to a
    ``(language, script, region)`` tuple. The ``script`` and ``region``
    properties are only written when the respective value is not None.
    ``println`` is called once per output line, entries are written in
    ascending key order.
    """
    def optional_property(name, subtag):
        # An absent subtag contributes nothing to the object literal.
        return u"" if subtag is None else u""", {}: "{}\"""".format(name, subtag)

    println(u"var __complexLanguageMappings = {")

    for deprecated_language in sorted(mappings):
        language, script, region = mappings[deprecated_language]
        script_part = optional_property("script", script)
        region_part = optional_property("region", region)
        println(u""" "{}": {{language: "{}"{}{}}},""".format(deprecated_language, language,
                                                             script_part, region_part))

    println(u"};")
def write_complex_region_mappings(println, mappings):
    """Emit the ``__complexRegionMappings`` JavaScript object.

    ``mappings`` maps a deprecated region subtag to a tuple
    ``(default, non_default_replacements)``, where the second item is a
    list of ``(language, script, region)`` tuples and ``script`` may be
    None. For every deprecated region the default region is written first,
    followed by one ``"language[-script]": "region",`` line per exception.
    ``println`` is called once per output line.
    """
    def language_script_key(entry):
        # Entries without a script carry None. Python 3 refuses to order
        # None against str, which happens as soon as one language has
        # entries both with and without a script. Sorting None as the empty
        # string keeps the script-less entry first.
        (language, script, _) = entry
        return (language, script or u"")

    println(u"var __complexRegionMappings = {")

    for deprecated_region in sorted(mappings):
        (default, non_default_replacements) = mappings[deprecated_region]

        println(u""" "{}": {{""".format(deprecated_region))
        println(u""" default: "{}",""".format(default))
        for (language, script, region) in sorted(non_default_replacements,
                                                 key=language_script_key):
            mapping_key = language
            if script is not None:
                mapping_key += "-" + script
            println(u""" "{}": "{}",""".format(mapping_key, region))
        println(u" },")

    println(u"};")
def write_variant_mappings(println, mappings):
    """Emit the ``__variantMappings`` JavaScript object.

    ``mappings`` maps a deprecated variant subtag to a
    ``(type, replacement)`` tuple. ``println`` is called once per output
    line, entries are written in ascending key order.
    """
    println(u"var __variantMappings = {")

    for deprecated_variant in sorted(mappings):
        kind, replacement = mappings[deprecated_variant]
        entry = u""" "{}": {{type: "{}", replacement: "{}"}},""".format(deprecated_variant, kind,
                                                                       replacement)
        println(entry)

    println(u"};")
def write_unicode_extension_mappings(println, mapping, extension):
    """Emit the ``__<extension>Mappings`` JavaScript object.

    ``mapping`` maps an extension key name to a dictionary from deprecated
    value to replacement value. Both levels are written in ascending key
    order. ``println`` is called once per output line.
    """
    println(u"var __{}Mappings = {{".format(extension))

    for key in sorted(mapping):
        replacements = mapping[key]
        println(u""" "{}": {{""".format(key))
        for deprecated in sorted(replacements):
            println(u""" "{}": "{}",""".format(deprecated, replacements[deprecated]))
        println(u" },")

    println(u"};")
def write_cldr_language_tag_data(println, data, url):
    """Write all language tag mapping objects through ``println``.

    ``data`` has to contain the entries "languageMappings",
    "complexLanguageMappings", "regionMappings", "complexRegionMappings",
    "variantMappings", "unicodeMappings", and "transformMappings". All of
    them are looked up before the first line is written. ``url`` is
    accepted but not used here.
    """
    (language_mappings,
     complex_language_mappings,
     region_mappings,
     complex_region_mappings,
     variant_mappings,
     unicode_mappings,
     transform_mappings) = itemgetter("languageMappings",
                                      "complexLanguageMappings",
                                      "regionMappings",
                                      "complexRegionMappings",
                                      "variantMappings",
                                      "unicodeMappings",
                                      "transformMappings")(data)

    # Simple one-to-one tables.
    for (name, mappings) in (("__languageMappings", language_mappings),
                             ("__regionMappings", region_mappings)):
        write_simple_mappings(println, name, mappings)

    # Tables whose entries need a dedicated layout.
    write_complex_language_mappings(println, complex_language_mappings)
    write_complex_region_mappings(println, complex_region_mappings)
    write_variant_mappings(println, variant_mappings)

    # Unicode extension tables.
    for (extension, mappings) in (("unicode", unicode_mappings),
                                  ("transform", transform_mappings)):
        write_unicode_extension_mappings(println, mappings, extension)
def update_cldr_lang_tags(args):
    """Generate the language tag mapping objects.

    ``args`` is the parsed command line and provides:
    - version: the CLDR version, substituted for "<VERSION>" in ``url``
    - url: download location of the CLDR core.zip
    - out: output file name, or "stdout" to write to standard output
    - file: optional local core.zip; when given nothing is downloaded
    """
    version = args.version
    out = args.out
    filename = args.file
    url = args.url.replace("<VERSION>", version)

    print("Arguments:")
    print("\tCLDR version: %s" % version)
    print("\tDownload url: %s" % url)
    if filename is not None:
        print("\tLocal CLDR core.zip file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    data = {
        "version": version,
    }

    def read_files(cldr_file):
        # Both readers consume the same opened archive.
        with ZipFile(cldr_file) as zip_file:
            data.update(read_supplemental_data(zip_file))
            data.update(read_unicode_extensions(zip_file))

    print("Processing CLDR data...")
    if filename is not None:
        print("Always make sure you have the newest CLDR core.zip!")
        with open(filename, "rb") as cldr_file:
            read_files(cldr_file)
    else:
        print("Downloading CLDR core.zip...")
        with closing(urlopen(url)) as cldr_file:
            # ZipFile needs a seekable file, so buffer the download.
            cldr_data = io.BytesIO(cldr_file.read())
            read_files(cldr_data)

    print("Writing Intl data...")
    if out == "stdout":
        # The data is written through a second, UTF-8 encoded wrapper around
        # the stdout descriptor. Flush the status messages first so they
        # aren't emitted after the data, and keep the descriptor open when
        # the wrapper is closed: sys.stdout still owns it and flushes it
        # again at interpreter exit.
        sys.stdout.flush()
        target = sys.stdout.fileno()
        closefd = False
    else:
        target = out
        closefd = True

    with io.open(target, mode="w", encoding="utf-8", newline="", closefd=closefd) as f:
        println = partial(print, file=f)
        write_cldr_language_tag_data(println, data, url)
def ensure_https(v):
    """Return ``v`` unchanged when it is an ``https:`` URL.

    Intended as an argparse ``type=`` callback. Raises
    ``argparse.ArgumentTypeError`` naming the rejected value otherwise, so
    argparse can show that message to the user.
    """
    import argparse

    if not v.startswith("https:"):
        # The format string needs a placeholder; without one the "%"
        # operator itself raises TypeError and the message is lost.
        raise argparse.ArgumentTypeError("URL protocol must be https: %s" % v)
    return v


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Update CLDR language tags data.")
    parser.add_argument("--version",
                        metavar="VERSION",
                        required=True,
                        help="CLDR version number")
    parser.add_argument("--url",
                        metavar="URL",
                        default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
                        type=ensure_https,
                        help="Download url CLDR data (default: %(default)s)")
    parser.add_argument("--out",
                        default="stdout",
                        help="Output file (default: %(default)s)")
    parser.add_argument("file",
                        nargs="?",
                        help="Local cldr-core.zip file, if omitted uses <URL>")
    parser.set_defaults(func=update_cldr_lang_tags)

    args = parser.parse_args()
    args.func(args)