Move strwidth function variants to powerline.lib.unicode

This way they can be tested.
This commit is contained in:
ZyX 2014-12-04 23:56:40 +03:00
parent f3c8413043
commit 6dc585b7ee
2 changed files with 109 additions and 39 deletions

View File

@ -4,6 +4,8 @@ from __future__ import (unicode_literals, division, absolute_import, print_funct
import sys
import codecs
from unicodedata import east_asian_width, combining
from powerline.lib.encoding import get_preferred_output_encoding
@ -134,3 +136,97 @@ def surrogate_pair_to_character(high, low):
'''Transform a pair of surrogate codepoints to one codepoint
'''
return 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
_strwidth_documentation = (
'''Compute string width in display cells
{0}
:param dict width_data:
Dictionary which maps east_asian_width property values to strings
lengths. It is expected to contain the following keys and values (from
`East Asian Width annex <http://www.unicode.org/reports/tr11/>`_):
=== ====== ===========================================================
Key Value Description
=== ====== ===========================================================
F 2 Fullwidth: all characters that are defined as Fullwidth in
the Unicode Standard [Unicode] by having a compatibility
decomposition of type <wide> to characters elsewhere in the
Unicode Standard that are implicitly narrow but unmarked.
H 1 Halfwidth: all characters that are explicitly defined as
Halfwidth in the Unicode Standard by having a compatibility
decomposition of type <narrow> to characters elsewhere in
the Unicode Standard that are implicitly wide but unmarked,
plus U+20A9 WON SIGN.
W 2 Wide: all other characters that are always wide. These
characters occur only in the context of East Asian
typography where they are wide characters (such as the
Unified Han Ideographs or Squared Katakana Symbols). This
category includes characters that have explicit halfwidth
counterparts.
Na 1 Narrow: characters that are always narrow and have explicit
fullwidth or wide counterparts. These characters are
implicitly narrow in East Asian typography and legacy
character sets because they have explicit fullwidth or wide
counterparts. All of ASCII is an example of East Asian
Narrow characters.
A 1 or 2 Ambigious: characters that may sometimes be wide and
sometimes narrow. Ambiguous characters require additional
information not contained in the character code to further
resolve their width. This information is usually defined in
terminal setting that should in turn respect glyphs widths
in used fonts. Also see :ref:`ambiwidth configuration
option <config-common-ambiwidth>`.
N 1 Neutral characters: character that does not occur in legacy
East Asian character sets.
=== ====== ===========================================================
:param unicode string:
String whose width will be calculated.
:return: unsigned integer.''')
def strwidth_ucs_4(width_data, string):
	# The docstring is assigned to ``__doc__`` right after this definition.
	cells = 0
	for char in string:
		# Combining characters contribute no width; every other character
		# is looked up by its east_asian_width property value.
		if not combining(char):
			cells += width_data[east_asian_width(char)]
	return cells
# Fill the shared docstring template with the notes specific to this variant.
# The directive is spelled ``.. note::`` (two colons): with a single colon
# reStructuredText treats the block as a comment and drops it from the output.
strwidth_ucs_4.__doc__ = _strwidth_documentation.format(
	'''This version of function expects that characters above 0xFFFF are
	represented using one symbol. This is only the case in UCS-4 Python builds.

	.. note::
		Even in UCS-4 Python builds it is possible to represent characters above
		0xFFFF using surrogate pairs. Characters represented this way are not
		supported.''')
def strwidth_ucs_2(width_data, string):
	# The docstring is assigned to ``__doc__`` right after this definition.
	cells = 0
	for index, char in enumerate(string):
		code = ord(char)
		if 0xDC00 <= code <= 0xDFFF:
			# Low surrogate: combine it with the preceding symbol and use the
			# width of the resulting character.
			character = unichr(
				surrogate_pair_to_character(ord(string[index - 1]), code)
			)
			cells += width_data[east_asian_width(character)]
		elif combining(char) or 0xD800 <= code <= 0xDBFF:
			# Combining characters and high surrogates add nothing here; a
			# high surrogate is accounted for when its low half is reached.
			continue
		else:
			cells += width_data[east_asian_width(char)]
	return cells
# Fill the shared docstring template with the notes specific to this variant.
strwidth_ucs_2.__doc__ = _strwidth_documentation.format(
	'''This version of function expects that characters above 0xFFFF are
	represented using two symbols forming a surrogate pair, which is the only
	option in UCS-2 Python builds. It still works correctly in UCS-4 Python
	builds, but is slower than its UCS-4 counterpart.''')

View File

@ -5,11 +5,10 @@ import sys
import os
import re
from unicodedata import east_asian_width, combining
from itertools import chain
from powerline.theme import Theme
from powerline.lib.unicode import unichr, surrogate_pair_to_character
from powerline.lib.unicode import unichr, strwidth_ucs_2, strwidth_ucs_4
NBSP = ' '
@ -177,7 +176,10 @@ class Renderer(object):
'F': 2, # Fullwidth
}
def strwidth(self, string):
# Dispatches on sys.maxunicode: below 0x10FFFF the surrogate-pair aware
# strwidth_ucs_2 is called, otherwise strwidth_ucs_4. Either one receives
# this object's ``width_data`` mapping and the string to measure.
strwidth = lambda self, s: (
(strwidth_ucs_2 if sys.maxunicode < 0x10FFFF else strwidth_ucs_4)(
self.width_data, s)
)
'''Function that returns string width.
Is used to calculate the place given string occupies when handling
@ -189,34 +191,6 @@ class Renderer(object):
:return: unsigned integer.
'''
return sum(((
(
0
) if combining(symbol) else (
self.width_data[east_asian_width(symbol)]
)
) for symbol in string))
if sys.maxunicode < 0x10FFFF:
old_strwidth = strwidth
def strwidth(self, string):
return sum(((
(
self.width_data[
east_asian_width(
unichr(surrogate_pair_to_character(ord(string[i - 1]), ord(symbol)))
)
]
) if 0xDC00 <= ord(symbol) <= 0xDFFF else (
0
) if combining(symbol) or 0xD800 <= ord(symbol) <= 0xDBFF else (
self.width_data[east_asian_width(symbol)]
)
) for i, symbol in enumerate(string)))
strwidth.__doc__ = old_strwidth.__doc__
del old_strwidth
def get_theme(self, matcher_info):
'''Get Theme object.