Add support for UCS-2 Python versions

Fixes #1213
This commit is contained in:
ZyX 2014-12-04 20:23:37 +03:00
parent 61f0542946
commit 209d6be91e
2 changed files with 96 additions and 21 deletions

View File

@ -7,7 +7,25 @@ import re
from powerline.lib.unicode import unichr
# Matches any single character that is NOT one of: tab, newline, printable
# ASCII (0x20-0x7E), U+0085, U+00A0-U+D7FF or U+E000-U+FFFD.
NON_PRINTABLE = re.compile('[^\t\n\x20-\x7E' + unichr(0x85) + (unichr(0xA0) + '-' + unichr(0xD7FF)) + (unichr(0xE000) + '-' + unichr(0xFFFD)) + ']')
# Regular expression matching one non-printable character.
#
# The negated class lists every code point that is considered printable. On
# narrow (UCS-2) builds characters above 0xFFFF are stored as surrogate pairs,
# so surrogates have to be let through by the class and the *unpaired* ones are
# then caught by the two extra alternatives. On wide builds the astral range is
# listed directly and lone surrogates stay outside the class.
NON_PRINTABLE = re.compile(
	'[^'
	# ASCII control characters: 0x00-0x1F
	+ '\t\n'           # Tab, newline: allowed ASCII control characters
	+ '\x20-\x7E'      # ASCII printable characters
	# Unicode control characters: 0x7F-0x9F
	+ '\u0085'         # Allowed unicode control character: next line character
	+ '\u00A0-\uD7FF'
	# Surrogate escapes: 0xD800-0xDFFF
	+ '\uE000-\uFFFD'
	+ ((
		# Narrow build: surrogates are accepted here, pairing is checked below.
		'\uD800-\uDFFF'
	) if sys.maxunicode < 0x10FFFF else (
		# Wide build: characters above 0xFFFF are ordinary single code points.
		'\U00010000-\U0010FFFF'
	))
	+ ']'
	+ ((
		# Paired surrogate escapes are allowed in UCS-2 builds as the only way
		# to represent characters above 0xFFFF. Only unpaired ones are
		# rejected: a low surrogate without a preceding high surrogate, or
		# a high surrogate without a following low surrogate.
		'|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]'
		+ '|[\uD800-\uDBFF](?![\uDC00-\uDFFF])'
	) if sys.maxunicode < 0x10FFFF else (
		''
	))
)
def repl(s):

View File

@ -1,7 +1,9 @@
# vim:fileencoding=utf-8:noet
from __future__ import (unicode_literals, division, absolute_import, print_function)
import sys
import os
import re
from unicodedata import east_asian_width, combining
from itertools import chain
@ -13,6 +15,80 @@ from powerline.lib.unicode import unichr
NBSP = ' '
np_control_character_translations = dict(
	# Control characters: ^@ … ^_
	(codepoint, '^' + unichr(0x40 + codepoint))
	for codepoint in range(0x20)
)
'''Control character translations

Dictionary that maps code points in range 0x00-0x1F (inclusive) to the strings
``'^@'``, ``'^A'`` and so on, up to ``'^_'``. Keys are integer ordinals.

.. note:: maps tab to ``^I`` and newline to ``^J``.
'''
np_invalid_character_translations = dict(
	# Invalid unicode characters obtained using 'surrogateescape' error
	# handler: byte 0xNN is stored as code point 0xDC00 + 0xNN.
	(0xDC00 + byte, '<{0:02x}>'.format(byte)) for byte in range(0x80, 0x100)
)
'''Invalid unicode character translations

When using the ``surrogateescape`` encoding error handling method, bytes in
range 0x80-0xFF (inclusive) are transformed into unpaired surrogate escape
unicode code points 0xDC80-0xDCFF (inclusive). This dictionary maps such code
points to ``<80>``, ``<81>``, and so on: in Python-3 they cannot be printed or
converted to UTF-8 because the UTF-8 standard does not allow surrogate escape
characters, not even paired ones. Python-2 contains a bug that allows such
action, but printing them in any case makes no sense.
'''
# XXX: not using `r` because it makes no sense.
# The class ends at U+DCFF (not U+DD00): every match is looked up in
# ``np_invalid_character_translations``, whose keys stop at 0xDCFF, so matching
# U+DD00 would raise KeyError there.
np_invalid_character_re = re.compile('(?<![\uD800-\uDBFF])[\uDC80-\uDCFF]')
'''Regex that finds unpaired surrogate escape characters

Search is limited to the code points obtained from the ``surrogateescape``
error handling method (0xDC80-0xDCFF, inclusive) that are not preceded by a
high surrogate (0xD800-0xDBFF). This regex is only used for UCS-2 Python
variants because in this case characters above 0xFFFF are represented as
surrogate escape characters and are thus subject to partial transformation if
the ``np_invalid_character_translations`` translation table is used.
'''
np_character_translations = np_control_character_translations.copy()
'''Dictionary that contains non-printable character translations

In UCS-4 versions of Python this is a union of
``np_invalid_character_translations`` and ``np_control_character_translations``
dictionaries. In UCS-2 for technical reasons ``np_invalid_character_re`` is used
instead and this dictionary only contains items from
``np_control_character_translations``.
'''
if sys.maxunicode >= 0x10FFFF:
	# UCS-4 build: a surrogate code point can never be half of an astral
	# character, so surrogate escapes are safe to translate through the same
	# table as control characters.
	np_character_translations.update(np_invalid_character_translations)
if sys.maxunicode < 0x10FFFF:
	def translate_np(s):
		'''Translate non-printable characters into printable strings

		UCS-2 variant: unpaired surrogate escape characters found by
		``np_invalid_character_re`` are first replaced using
		``np_invalid_character_translations``, then the result is passed
		through ``np_character_translations``.

		Is used to translate control characters and surrogate escape characters
		obtained from ``surrogateescape`` encoding errors handling method into
		some printable sequences. See documentation for
		``np_invalid_character_translations`` and
		``np_control_character_translations`` for more details.
		'''
		without_invalid = np_invalid_character_re.sub(
			lambda match: np_invalid_character_translations[ord(match.group(0))],
			s
		)
		return without_invalid.translate(np_character_translations)
else:
	def translate_np(s):
		'''Translate non-printable characters into printable strings

		UCS-4 variant: the string is passed through
		``np_character_translations`` only.

		Is used to translate control characters and surrogate escape characters
		obtained from ``surrogateescape`` encoding errors handling method into
		some printable sequences. See documentation for
		``np_invalid_character_translations`` and
		``np_control_character_translations`` for more details.
		'''
		return s.translate(np_character_translations)
def construct_returned_value(rendered_highlighted, segments, width, output_raw, output_width):
if not (output_raw or output_width):
return rendered_highlighted
@ -75,25 +151,6 @@ class Renderer(object):
See documentation of ``unicode.translate`` for details.
'''
np_character_translations = dict(chain(
	# Control characters: ^@ … ^_
	((codepoint, '^' + unichr(0x40 + codepoint)) for codepoint in range(0x20)),
	# Invalid unicode characters obtained using 'surrogateescape' error
	# handler: byte 0xNN is stored as code point 0xDC00 + 0xNN.
	((0xDC00 + byte, '<{0:02x}>'.format(byte)) for byte in range(0x80, 0x100)),
))
'''Non-printable character translations

These are used to transform characters in range 0x00-0x1F into ``^@``,
``^A`` and so on and characters in range 0xDC80-0xDCFF into ``<80>``,
``<81>`` and so on (latter are invalid characters obtained using
``surrogateescape`` error handling method used automatically in a number of
places in Python3). Unlike with ``.escape()`` method (and
``character_translations``) result is passed to ``.strwidth()`` method.

Note: transforms tab into ``^I``.
'''
def __init__(self,
theme_config,
local_themes,
@ -381,7 +438,7 @@ class Renderer(object):
contents_highlighted = ''
draw_divider = segment['draw_' + divider_type + '_divider']
contents_raw = contents_raw.translate(self.np_character_translations)
contents_raw = translate_np(contents_raw)
# XXX Make sure self.hl() calls are called in the same order
# segments are displayed. This is needed for Vim renderer to work.