mirror of https://github.com/tc39/test262.git
Add the language tag mappings python script
The file is derived from the same-named file for SpiderMonkey, therefore I've kept the MPL license info. The next commits use this script to generate language tag mappings data.
This commit is contained in:
parent
756ee6a171
commit
033f31a8ed
|
@ -0,0 +1,645 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# Copyright (C) 2020 Mozilla Corporation. All rights reserved.
|
||||||
|
#
|
||||||
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||||
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||||
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
#
|
||||||
|
# Original file:
|
||||||
|
# https://hg.mozilla.org/mozilla-central/file/tip/js/src/builtin/intl/make_intl_data.py
|
||||||
|
|
||||||
|
""" Usage:
|
||||||
|
make_intl_data.py --version VERSION [--url URL] [--out OUT] [cldr_core.zip]
|
||||||
|
|
||||||
|
|
||||||
|
Target "langtags":
|
||||||
|
This script extracts information about 1) mappings between deprecated and
|
||||||
|
current Unicode BCP 47 locale identifiers, and 2) deprecated and current
|
||||||
|
BCP 47 Unicode extension values from CLDR.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import io
|
||||||
|
import sys
|
||||||
|
from contextlib import closing
|
||||||
|
from functools import partial
|
||||||
|
from operator import itemgetter
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
if sys.version_info.major == 2:
|
||||||
|
from urllib2 import urlopen
|
||||||
|
else:
|
||||||
|
from urllib.request import urlopen
|
||||||
|
|
||||||
|
|
||||||
|
def read_supplemental_data(core_file):
    """ Reads CLDR Supplemental Data and extracts information for Intl.js.

        The data is read from "common/supplemental/supplementalMetadata.xml"
        and "common/supplemental/likelySubtags.xml" inside |core_file|, which
        must provide a ZipFile-like open(member_name) method.

        Information extracted:
        - grandfatheredMappings: mappings from grandfathered tags to preferred
          complete language tags
        - languageMappings: mappings from language subtags to preferred subtags
        - complexLanguageMappings: mappings from language subtags with complex rules
        - regionMappings: mappings from region subtags to preferred subtags
        - complexRegionMappings: mappings from region subtags with complex rules
        - variantMappings: mappings from variant subtags to preferred subtags

        Returns these mappings as dictionaries, keyed by the names above.
    """
    import xml.etree.ElementTree as ET

    def parse_xml(member_name):
        # ET.parse() only closes files it opened itself, so close the archive
        # member explicitly once the tree has been built.
        with core_file.open(member_name) as xml_file:
            return ET.parse(xml_file)

    # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
    re_unicode_language_id = re.compile(
        r"""
        ^
        # unicode_language_id = unicode_language_subtag
        #     unicode_language_subtag = alpha{2,3} | alpha{5,8}
        (?P<language>[a-z]{2,3}|[a-z]{5,8})

        # (sep unicode_script_subtag)?
        #     unicode_script_subtag = alpha{4}
        (?:-(?P<script>[a-z]{4}))?

        # (sep unicode_region_subtag)?
        #     unicode_region_subtag = (alpha{2} | digit{3})
        (?:-(?P<region>([a-z]{2}|[0-9]{3})))?

        # (sep unicode_variant_subtag)*
        #     unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
        (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
        $
        """, re.IGNORECASE | re.VERBOSE)

    re_unicode_language_subtag = re.compile(
        r"""
        ^
        # unicode_language_subtag = alpha{2,3} | alpha{5,8}
        ([a-z]{2,3}|[a-z]{5,8})
        $
        """, re.IGNORECASE | re.VERBOSE)

    re_unicode_region_subtag = re.compile(
        r"""
        ^
        # unicode_region_subtag = (alpha{2} | digit{3})
        ([a-z]{2}|[0-9]{3})
        $
        """, re.IGNORECASE | re.VERBOSE)

    re_unicode_variant_subtag = re.compile(
        r"""
        ^
        # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
        ([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3}))
        $
        """, re.IGNORECASE | re.VERBOSE)

    # The fixed list of BCP 47 grandfathered language tags.
    grandfathered_tags = (
        "art-lojban",
        "cel-gaulish",
        "en-GB-oed",
        "i-ami",
        "i-bnn",
        "i-default",
        "i-enochian",
        "i-hak",
        "i-klingon",
        "i-lux",
        "i-mingo",
        "i-navajo",
        "i-pwn",
        "i-tao",
        "i-tay",
        "i-tsu",
        "no-bok",
        "no-nyn",
        "sgn-BE-FR",
        "sgn-BE-NL",
        "sgn-CH-DE",
        "zh-guoyu",
        "zh-hakka",
        "zh-min",
        "zh-min-nan",
        "zh-xiang",
    )

    # The list of grandfathered tags which are valid Unicode BCP 47 locale identifiers.
    unicode_bcp47_grandfathered_tags = {tag for tag in grandfathered_tags
                                        if re_unicode_language_id.match(tag)}

    # Dictionary of simple language subtag mappings, e.g. "in" -> "id".
    language_mappings = {}

    # Dictionary of complex language subtag mappings, modifying more than one
    # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
    complex_language_mappings = {}

    # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
    region_mappings = {}

    # Dictionary of complex region subtag mappings, containing more than one
    # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
    complex_region_mappings = {}

    # Dictionary of aliased variant subtags to a tuple of preferred replacement
    # type and replacement, e.g. "arevela" -> ("language", "hy") or
    # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
    variant_mappings = {}

    # Dictionary of grandfathered mappings to preferred values.
    grandfathered_mappings = {}

    # CLDR uses "_" as the separator for some elements. Replace it with "-".
    def bcp47_id(cldr_id):
        return cldr_id.replace("_", "-")

    # CLDR uses the canonical case for most entries, but there are some
    # exceptions, like:
    #   <languageAlias type="drw" replacement="fa_af" reason="deprecated"/>
    # Therefore canonicalize all tags to be on the safe side.
    def bcp47_canonical(language, script, region):
        # Canonical case for language subtags is lower case.
        # Canonical case for script subtags is title case.
        # Canonical case for region subtags is upper case.
        return (language.lower() if language else None,
                script.title() if script else None,
                region.upper() if region else None)

    tree = parse_xml("common/supplemental/supplementalMetadata.xml")

    for language_alias in tree.iterfind(".//languageAlias"):
        alias_type = bcp47_id(language_alias.get("type"))
        replacement = bcp47_id(language_alias.get("replacement"))

        # Handle grandfathered mappings first.
        if alias_type in unicode_bcp47_grandfathered_tags:
            grandfathered_mappings[alias_type] = replacement
            continue

        # We're only interested in language subtag matches, so ignore any
        # entries which have additional subtags.
        if re_unicode_language_subtag.match(alias_type) is None:
            continue

        assert alias_type.islower()

        if re_unicode_language_subtag.match(replacement) is not None:
            # Canonical case for language subtags is lower-case.
            language_mappings[alias_type] = replacement.lower()
        else:
            replacement_match = re_unicode_language_id.match(replacement)
            assert replacement_match is not None, (
                "{} invalid Unicode BCP 47 locale identifier".format(replacement))
            assert replacement_match.group("variants") is None, (
                "{}: unexpected variant subtags in {}".format(alias_type, replacement))

            complex_language_mappings[alias_type] = bcp47_canonical(
                replacement_match.group("language"),
                replacement_match.group("script"),
                replacement_match.group("region"))

    for territory_alias in tree.iterfind(".//territoryAlias"):
        alias_type = territory_alias.get("type")
        replacement = territory_alias.get("replacement")

        # We're only interested in region subtag matches, so ignore any entries
        # which contain legacy formats, e.g. three letter region codes.
        if re_unicode_region_subtag.match(alias_type) is None:
            continue

        assert alias_type.isupper() or alias_type.isdigit()

        if re_unicode_region_subtag.match(replacement) is not None:
            # Canonical case for region subtags is upper-case.
            region_mappings[alias_type] = replacement.upper()
        else:
            # Canonical case for region subtags is upper-case.
            replacements = [r.upper() for r in replacement.split(" ")]
            assert all(
                re_unicode_region_subtag.match(loc) is not None for loc in replacements
            ), "{} invalid region subtags".format(replacement)
            complex_region_mappings[alias_type] = replacements

    for variant_alias in tree.iterfind(".//variantAlias"):
        alias_type = variant_alias.get("type")
        replacement = variant_alias.get("replacement")

        assert re_unicode_variant_subtag.match(alias_type) is not None, (
            "{} invalid variant subtag".format(alias_type))

        # Normalize the case, because some variants are in upper case.
        alias_type = alias_type.lower()

        # The replacement can be a language, a region, or a variant subtag.
        # Language and region subtags are case normalized, variant subtags can
        # be in any case.

        if re_unicode_language_subtag.match(replacement) is not None and replacement.islower():
            variant_mappings[alias_type] = ("language", replacement)

        elif re_unicode_region_subtag.match(replacement) is not None:
            assert replacement.isupper() or replacement.isdigit(), (
                "{} invalid variant subtag replacement".format(replacement))
            variant_mappings[alias_type] = ("region", replacement)

        else:
            assert re_unicode_variant_subtag.match(replacement) is not None, (
                "{} invalid variant subtag replacement".format(replacement))
            variant_mappings[alias_type] = ("variant", replacement.lower())

    tree = parse_xml("common/supplemental/likelySubtags.xml")

    # Maps canonical (language, script, region) tuples to their likely
    # (language, script, region) expansion.
    likely_subtags = {}

    for likely_subtag in tree.iterfind(".//likelySubtag"):
        from_tag = bcp47_id(likely_subtag.get("from"))
        from_match = re_unicode_language_id.match(from_tag)
        assert from_match is not None, (
            "{} invalid Unicode BCP 47 locale identifier".format(from_tag))
        assert from_match.group("variants") is None, (
            "unexpected variant subtags in {}".format(from_tag))

        to_tag = bcp47_id(likely_subtag.get("to"))
        to_match = re_unicode_language_id.match(to_tag)
        assert to_match is not None, (
            "{} invalid Unicode BCP 47 locale identifier".format(to_tag))
        assert to_match.group("variants") is None, (
            "unexpected variant subtags in {}".format(to_tag))

        from_canonical = bcp47_canonical(from_match.group("language"),
                                         from_match.group("script"),
                                         from_match.group("region"))

        to_canonical = bcp47_canonical(to_match.group("language"),
                                       to_match.group("script"),
                                       to_match.group("region"))

        likely_subtags[from_canonical] = to_canonical

    complex_region_mappings_final = {}

    for (deprecated_region, replacements) in complex_region_mappings.items():
        # Find all likely subtag entries which don't already contain a region
        # subtag and whose target region is in the list of replacement regions.
        region_likely_subtags = [(from_language, from_script, to_region)
                                 for ((from_language, from_script, from_region),
                                      (_, _, to_region)) in likely_subtags.items()
                                 if from_region is None and to_region in replacements]

        # The first replacement entry is the default region.
        default = replacements[0]

        # Find all likely subtag entries whose region matches the default region.
        default_replacements = {(language, script)
                                for (language, script, region) in region_likely_subtags
                                if region == default}

        # And finally find those entries which don't use the default region.
        # These are the entries we're actually interested in, because those need
        # to be handled specially when selecting the correct preferred region.
        non_default_replacements = [(language, script, region)
                                    for (language, script, region) in region_likely_subtags
                                    if (language, script) not in default_replacements]

        # If there are no non-default replacements, we can handle the region as
        # part of the simple region mapping.
        if non_default_replacements:
            complex_region_mappings_final[deprecated_region] = (default, non_default_replacements)
        else:
            region_mappings[deprecated_region] = default

    return {"grandfatheredMappings": grandfathered_mappings,
            "languageMappings": language_mappings,
            "complexLanguageMappings": complex_language_mappings,
            "regionMappings": region_mappings,
            "complexRegionMappings": complex_region_mappings_final,
            "variantMappings": variant_mappings,
            }
|
||||||
|
|
||||||
|
|
||||||
|
def read_unicode_extensions(core_file):
    """ Reads the deprecated-to-preferred mappings for Unicode extension values.

        Every "common/bcp47/*.xml" member of |core_file| is scanned for
        'preferred' and 'alias' attributes on <type> elements, and
        "common/supplemental/supplementalMetadata.xml" is scanned for
        <subdivisionAlias> elements, which are recorded for both the "rg" and
        the "sd" key. |core_file| must provide ZipFile-like namelist() and
        open(member_name) methods.

        Returns a dictionary with two entries, each mapping an extension key
        to a dictionary of deprecated value -> preferred value:
        - unicodeMappings: mappings for the Unicode BCP 47 U extension
        - transformMappings: mappings for the Unicode BCP 47 T extension
    """
    import xml.etree.ElementTree as ET

    # Match all xml-files in the BCP 47 directory.
    bcp_file_re = re.compile(r"^common/bcp47/.+\.xml$")

    # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
    #
    # type = alphanum{3,8} (sep alphanum{3,8})* ;
    type_re = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")

    # Mapping from Unicode extension types to dict of deprecated to
    # preferred values.
    mapping = {
        # Unicode BCP 47 U Extension
        "u": {},

        # Unicode BCP 47 T Extension
        "t": {},
    }

    def read_bcp47_file(file):
        tree = ET.parse(file)
        for keyword in tree.iterfind(".//keyword/key"):
            extension = keyword.get("extension", "u")
            assert extension == "u" or extension == "t", (
                "unknown extension type: {}".format(extension))

            extension_name = keyword.get("name")

            for type_element in keyword.iterfind("type"):
                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The key or type name used by Unicode locale extension with 'u' extension
                # syntax or the 't' extensions syntax. When alias below is absent, this name
                # can be also used with the old style "@key=type" syntax.
                name = type_element.get("name")

                # Ignore the special name:
                # - <https://unicode.org/reports/tr35/#CODEPOINTS>
                # - <https://unicode.org/reports/tr35/#REORDER_CODE>
                # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
                # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
                # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
                if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE",
                            "PRIVATE_USE"):
                    continue

                # All other names should match the 'type' production.
                assert type_re.match(name) is not None, (
                    "{} doesn't match the 'type' production".format(name))

                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The preferred value of the deprecated key, type or attribute element.
                # When a key, type or attribute element is deprecated, this attribute is
                # used for specifying a new canonical form if available.
                preferred = type_element.get("preferred")

                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The BCP 47 form is the canonical form, and recommended. Other aliases are
                # included only for backwards compatibility.
                alias = type_element.get("alias")

                # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
                #
                # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
                # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
                # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
                # value, while the canonical is in the name attribute value.

                # 'preferred' contains the new preferred name, 'alias' the compatibility
                # name, but then there's this entry where 'preferred' and 'alias' are the
                # same. So which one to choose? Assume 'preferred' is the actual canonical
                # name.
                #
                # <type name="islamicc"
                #       description="Civil (algorithmic) Arabic calendar"
                #       deprecated="true"
                #       preferred="islamic-civil"
                #       alias="islamic-civil"/>

                if preferred is not None:
                    assert type_re.match(preferred), preferred
                    mapping[extension].setdefault(extension_name, {})[name] = preferred

                if alias is not None:
                    for alias_name in alias.lower().split(" "):
                        # Ignore alias entries which don't match the 'type' production.
                        if type_re.match(alias_name) is None:
                            continue

                        # See comment above when 'alias' and 'preferred' are both present.
                        if (preferred is not None and
                                name in mapping[extension][extension_name]):
                            continue

                        # Skip over entries where 'name' and 'alias' are equal.
                        #
                        # <type name="pst8pdt"
                        #       description="POSIX style time zone for US Pacific Time"
                        #       alias="PST8PDT"
                        #       since="1.8"/>
                        if name == alias_name:
                            continue

                        mapping[extension].setdefault(extension_name, {})[alias_name] = name

    def read_supplemental_metadata(file):
        # Find subdivision and region replacements.
        #
        # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
        #
        # Replace aliases in special key values:
        # - If there is an 'sd' or 'rg' key, replace any subdivision alias
        #   in its value in the same way, using subdivisionAlias data.
        tree = ET.parse(file)
        for alias in tree.iterfind(".//subdivisionAlias"):
            subdivision = alias.get("type")
            assert type_re.match(subdivision) is not None, (
                "{} doesn't match the 'type' production".format(subdivision))

            # Take the first replacement when multiple ones are present.
            replacement = alias.get("replacement").split(" ")[0].lower()

            # Skip over invalid replacements.
            #
            # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/>
            #
            # It's not entirely clear to me if CLDR actually wants to use
            # "axzzzz" as the replacement for this case.
            if type_re.match(replacement) is None:
                continue

            # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
            mapping["u"].setdefault("rg", {})[subdivision] = replacement
            mapping["u"].setdefault("sd", {})[subdivision] = replacement

    # ET.parse() only closes files it opened itself, so every archive member
    # is opened in a 'with' block to release it as soon as it has been read.
    for name in core_file.namelist():
        if bcp_file_re.match(name):
            with core_file.open(name) as bcp47_file:
                read_bcp47_file(bcp47_file)

    with core_file.open("common/supplemental/supplementalMetadata.xml") as metadata_file:
        read_supplemental_metadata(metadata_file)

    return {
        "unicodeMappings": mapping["u"],
        "transformMappings": mapping["t"],
    }
|
||||||
|
|
||||||
|
|
||||||
|
def write_simple_mappings(println, name, mappings):
    """ Writes |mappings| as a JavaScript object literal assigned to `var <name>`.

        |println| is called once per output line; entries are written in
        ascending key order.
    """
    entry_template = u' "{}": "{}",'

    println(u"var {} = {{".format(name))
    for deprecated in sorted(mappings):
        println(entry_template.format(deprecated, mappings[deprecated]))
    println(u"};")
|
||||||
|
|
||||||
|
|
||||||
|
def write_complex_language_mappings(println, mappings):
    """ Writes the `__complexLanguageMappings` JavaScript object.

        |mappings| maps a deprecated language subtag to a
        (language, script, region) tuple. Script and region are only written
        when they are not None. Entries are written in ascending key order,
        one |println| call per line.
    """
    def optional_field(label, value):
        # Absent subtags contribute nothing to the object literal.
        return u"" if value is None else u', {}: "{}"'.format(label, value)

    println(u"var __complexLanguageMappings = {")

    for old_language in sorted(mappings):
        language, script, region = mappings[old_language]
        extra_fields = optional_field("script", script) + optional_field("region", region)
        println(u' "{}": {{language: "{}"{}}},'.format(old_language, language, extra_fields))

    println(u"};")
|
||||||
|
|
||||||
|
|
||||||
|
def write_complex_region_mappings(println, mappings):
    """ Writes the `__complexRegionMappings` JavaScript object.

        |mappings| maps a deprecated region subtag to a tuple
        (default_region, non_default_replacements), where the second item is
        a list of (language, script, region) tuples and script may be None.
        Each replacement is written under the key "language" or
        "language-script". Regions and replacements are written in ascending
        order, one |println| call per line.
    """
    def replacement_order(entry):
        # Python 3 refuses to order None against str, which made the former
        # itemgetter(0, 1) key raise TypeError as soon as one language
        # appeared both with and without a script. The empty string sorts
        # before every script, matching Python 2's "None sorts first" order.
        (language, script, _) = entry
        return (language, u"" if script is None else script)

    println(u"var __complexRegionMappings = {")

    for (deprecated_region, (default, non_default_replacements)) in (
        sorted(mappings.items(), key=itemgetter(0))
    ):
        println(u""" "{}": {{""".format(deprecated_region))
        println(u""" default: "{}",""".format(default))

        for (language, script, region) in sorted(non_default_replacements,
                                                 key=replacement_order):
            mapping_key = language
            if script is not None:
                mapping_key += "-" + script

            println(u""" "{}": "{}",""".format(mapping_key, region))

        println(u" },")

    println(u"};")
|
||||||
|
|
||||||
|
|
||||||
|
def write_variant_mappings(println, mappings):
    """ Writes the `__variantMappings` JavaScript object.

        |mappings| maps a deprecated variant subtag to a
        (replacement type, replacement) tuple. Entries are written in
        ascending key order, one |println| call per line.
    """
    entry_template = u' "{}": {{type: "{}", replacement: "{}"}},'

    println(u"var __variantMappings = {")
    for old_variant in sorted(mappings):
        kind, new_subtag = mappings[old_variant]
        println(entry_template.format(old_variant, kind, new_subtag))
    println(u"};")
|
||||||
|
|
||||||
|
|
||||||
|
def write_unicode_extension_mappings(println, mapping, extension):
    """ Writes the `__<extension>Mappings` JavaScript object.

        |mapping| maps an extension key to a dictionary of deprecated value
        -> preferred value. Keys and their values are both written in
        ascending order, one |println| call per line.
    """
    println(u"var __{}Mappings = {{".format(extension))

    for key in sorted(mapping):
        println(u' "{}": {{'.format(key))

        values = mapping[key]
        for old_value in sorted(values):
            println(u' "{}": "{}",'.format(old_value, values[old_value]))

        println(u" },")

    println(u"};")
|
||||||
|
|
||||||
|
|
||||||
|
def write_cldr_language_tag_data(println, data, url):
    """ Writes all language tag mapping objects through |println|.

        |data| must contain the "languageMappings", "complexLanguageMappings",
        "regionMappings", "complexRegionMappings", "variantMappings",
        "unicodeMappings" and "transformMappings" entries. |url| is accepted
        but not used by this function.
    """
    # Look up every entry before writing anything, so that a missing entry
    # fails before any output has been produced.
    (languages,
     complex_languages,
     regions,
     complex_regions,
     variants,
     unicode_extensions,
     transform_extensions) = [data[entry] for entry in ("languageMappings",
                                                        "complexLanguageMappings",
                                                        "regionMappings",
                                                        "complexRegionMappings",
                                                        "variantMappings",
                                                        "unicodeMappings",
                                                        "transformMappings")]

    # Simple one-to-one subtag replacements.
    write_simple_mappings(println, "__languageMappings", languages)
    write_simple_mappings(println, "__regionMappings", regions)

    # Replacements which touch more than one subtag.
    write_complex_language_mappings(println, complex_languages)
    write_complex_region_mappings(println, complex_regions)

    write_variant_mappings(println, variants)

    # Unicode (-u-) and transform (-t-) extension value replacements.
    write_unicode_extension_mappings(println, unicode_extensions, "unicode")
    write_unicode_extension_mappings(println, transform_extensions, "transform")
|
||||||
|
|
||||||
|
|
||||||
|
def update_cldr_lang_tags(args):
    """ Generate the language tag mapping objects.

        |args| is the parsed command line and provides:
        - version: the CLDR version; replaces "<VERSION>" in |url|
        - url: the download location of the CLDR core.zip file
        - out: the output file name, or "stdout" to write to standard output
        - file: a local CLDR core.zip file, or None to download from |url|
    """
    version = args.version
    out = args.out
    filename = args.file

    url = args.url.replace("<VERSION>", version)

    print("Arguments:")
    print("\tCLDR version: %s" % version)
    print("\tDownload url: %s" % url)
    if filename is not None:
        print("\tLocal CLDR core.zip file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    data = {
        "version": version,
    }

    def read_files(cldr_file):
        with ZipFile(cldr_file) as zip_file:
            data.update(read_supplemental_data(zip_file))
            data.update(read_unicode_extensions(zip_file))

    print("Processing CLDR data...")
    if filename is not None:
        print("Always make sure you have the newest CLDR core.zip!")
        with open(filename, "rb") as cldr_file:
            read_files(cldr_file)
    else:
        print("Downloading CLDR core.zip...")
        with closing(urlopen(url)) as cldr_file:
            # ZipFile needs a seekable file, so buffer the download in memory.
            cldr_data = io.BytesIO(cldr_file.read())
        read_files(cldr_data)

    print("Writing Intl data...")
    if out == "stdout":
        # Write through the process's own stdout descriptor without taking
        # ownership of it: closing fd 1 here would break any later output,
        # including the flush of sys.stdout at interpreter exit. Flush the
        # status messages first so they stay ahead of the generated data.
        sys.stdout.flush()
        target = sys.stdout.fileno()
        closefd = False
    else:
        target = out
        closefd = True

    with io.open(target, mode="w", encoding="utf-8", newline="", closefd=closefd) as f:
        println = partial(print, file=f)

        write_cldr_language_tag_data(println, data, url)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_https(v):
    """ argparse 'type' callback which only accepts "https:" URLs.

        Returns |v| unchanged when it starts with "https:", otherwise raises
        argparse.ArgumentTypeError.
    """
    import argparse

    if not v.startswith("https:"):
        # The message needs a "%s" placeholder; without it the "%" operator
        # itself raised TypeError and argparse never saw the intended error.
        raise argparse.ArgumentTypeError("URL protocol must be https: %s" % v)
    return v


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Update CLDR language tags data.")

    parser.add_argument("--version",
                        metavar="VERSION",
                        required=True,
                        help="CLDR version number")
    parser.add_argument("--url",
                        metavar="URL",
                        default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
                        type=ensure_https,
                        help="Download url CLDR data (default: %(default)s)")
    parser.add_argument("--out",
                        default="stdout",
                        help="Output file (default: %(default)s)")
    parser.add_argument("file",
                        nargs="?",
                        help="Local cldr-core.zip file, if omitted uses <URL>")
    parser.set_defaults(func=update_cldr_lang_tags)

    args = parser.parse_args()
    args.func(args)
|
Loading…
Reference in New Issue