Add support for UCS-2 Python versions

Fixes #1213
2014-12-04 20:23:37 +03:00 · 2014-12-04 20:23:37 +03:00 · 209d6be91e
parent 61f0542946
commit 209d6be91e
2 changed files with 96 additions and 21 deletions
--- a/powerline/lint/markedjson/error.py
+++ b/powerline/lint/markedjson/error.py
@ -7,7 +7,25 @@ import re
 from powerline.lib.unicode import unichr


-NON_PRINTABLE = re.compile('[^\t\n\x20-\x7E' + unichr(0x85) + (unichr(0xA0) + '-' + unichr(0xD7FF)) + (unichr(0xE000) + '-' + unichr(0xFFFD)) + ']')
+NON_PRINTABLE = re.compile(
+	'[^'
+	# ASCII control characters: 0x00-0x1F
+	+ '\t\n'           # Tab, newline: allowed ASCII control characters
+	+ '\x20-\x7E'      # ASCII printable characters
+	# Unicode control characters: 0x7F-0x9F
+	+ '\u0085'         # Allowed unicode control character: next line character
+	+ '\u00A0-\uD7FF'
+	# Surrogate escapes: 0xD800-0xDFFF
+	+ '\uE000-\uFFFD'
+	+ ']'
+	+ ((
+		# Paired surrogate escapes: allowed in UCS-2 builds as the only way to 
+		# represent characters above 0xFFFF. Only paired variant is allowed.
+		'|[\uD800-\uDBFF][\uDC00-\uDFFF]'
+	) if sys.maxunicode < 0x10FFFF else (
+		''
+	))
+)


 def repl(s):
--- a/powerline/renderer.py
+++ b/powerline/renderer.py
@ -1,7 +1,9 @@
 # vim:fileencoding=utf-8:noet
 from __future__ import (unicode_literals, division, absolute_import, print_function)

+import sys
 import os
+import re

 from unicodedata import east_asian_width, combining
 from itertools import chain
@ -13,6 +15,80 @@ from powerline.lib.unicode import unichr
 NBSP = ' '


+np_control_character_translations = dict((
+	# Control characters: ^@ … ^_
+	(i1, '^' + unichr(i1 + 0x40)) for i1 in range(0x20)
+))
+'''Control character translations
+
+Dictionary that maps characters in range 0x00–0x1F (inclusive) to strings 
+``'^@'``, ``'^A'`` and so on.
+
+.. note:: maps tab to ``^I`` and newline to ``^J``.
+'''
+
+np_invalid_character_translations = dict((
+	# Invalid unicode characters obtained using 'surrogateescape' error 
+	# handler.
+	(i2, '<{0:02x}>'.format(i2 - 0xDC00)) for i2 in range(0xDC80, 0xDD00)
+))
+'''Invalid unicode character translations
+
+When using ``surrogateescape`` encoding error handling method characters in 
+range 0x80–0xFF (inclusive) are transformed into unpaired surrogate escape 
+unicode codepoints 0xDC80–0xDCFF. This dictionary maps such characters to 
+``<80>``, ``<81>``, and so on: in Python-3 they cannot be printed or 
+converted to UTF-8 because UTF-8 standard does not allow surrogate escape 
+characters, not even paired ones. Python-2 contains a bug that allows such 
+action, but printing them in any case makes no sense.
+'''
+
+# XXX: not using `r` because it makes no sense.
+np_invalid_character_re = re.compile('(?<![\uD800-\uDBFF])[\uDC80-\uDCFF]')
+'''Regex that finds unpaired surrogate escape characters
+
+Search is limited to codepoints 0xDC80–0xDCFF (inclusive): the ones produced 
+by the ``surrogateescape`` error handling method. This regex is only used for 
+UCS-2 Python variants because there characters above 0xFFFF are represented 
+as surrogate pairs and are thus subject to partial transformation if the 
+``np_invalid_character_translations`` translation table is used.
+'''
+
+np_character_translations = np_control_character_translations.copy()
+'''Dictionary that contains non-printable character translations
+
+UCS-4 Python: union of ``np_control_character_translations`` and 
+``np_invalid_character_translations``. UCS-2 Python: control characters only, 
+``np_invalid_character_re`` handles invalid characters instead.
+'''
+if sys.maxunicode >= 0x10FFFF:
+	np_character_translations.update(np_invalid_character_translations)
+
+translate_np = (
+	(
+		lambda s: (
+			np_invalid_character_re.subn(
+				lambda match: (
+					np_invalid_character_translations[ord(match.group(0))]
+				), s
+			)[0].translate(np_character_translations)
+		)
+	) if sys.maxunicode < 0x10FFFF else (
+		lambda s: (
+			s.translate(np_character_translations)
+		)
+	)
+)
+'''Function that translates non-printable characters into printable strings
+
+Is used to translate control characters and surrogate escape characters 
+obtained from ``surrogateescape`` encoding errors handling method into some 
+printable sequences. See documentation for 
+``np_invalid_character_translations`` and 
+``np_control_character_translations`` for more details.
+'''
+
+
 def construct_returned_value(rendered_highlighted, segments, width, output_raw, output_width):
 	if not (output_raw or output_width):
 		return rendered_highlighted
@ -75,25 +151,6 @@ class Renderer(object):
 	See documentation of ``unicode.translate`` for details.
 	'''

-	np_character_translations = dict(chain(
-		# Control characters: ^@ … ^Y
-		((i1, '^' + unichr(i1 + 0x40)) for i1 in range(0x20)),
-		# Invalid unicode characters obtained using 'surrogateescape' error 
-		# handler.
-		((i2, '<{0:02x}>'.format(i2 - 0xDC00)) for i2 in range(0xDC80, 0xDD00)),
-	))
-	'''Non-printable character translations
-
-	These are used to transform characters in range 0x00—0x1F into ``^@``, 
-	``^A`` and so on and characters in range 0xDC80—0xDCFF into ``<80>``, 
-	``<81>`` and so on (latter are invalid characters obtained using 
-	``surrogateescape`` error handling method used automatically in a number of 
-	places in Python3). Unilke with ``.escape()`` method (and 
-	``character_translations``) result is passed to ``.strwidth()`` method.
-
-	Note: transforms tab into ``^I``.
-	'''
-
 	def __init__(self,
 	             theme_config,
 	             local_themes,
@ -381,7 +438,7 @@ class Renderer(object):
 			contents_highlighted = ''
 			draw_divider = segment['draw_' + divider_type + '_divider']

-			contents_raw = contents_raw.translate(self.np_character_translations)
+			contents_raw = translate_np(contents_raw)

 			# XXX Make sure self.hl() calls are called in the same order 
 			# segments are displayed. This is needed for Vim renderer to work.