Move strwidth function variants to powerline.lib.unicode

They can be tested thus.
2025-09-26 03:19:07 +02:00 · 2014-12-04 23:56:40 +03:00 · 2014-12-04 23:56:40 +03:00 · 6dc585b7ee
commit 6dc585b7ee
parent f3c8413043
2 changed files with 109 additions and 39 deletions
--- a/powerline/lib/unicode.py
+++ b/powerline/lib/unicode.py
@ -4,6 +4,8 @@ from __future__ import (unicode_literals, division, absolute_import, print_funct
 import sys
 import codecs

+from unicodedata import east_asian_width, combining
+
 from powerline.lib.encoding import get_preferred_output_encoding


@ -134,3 +136,97 @@ def surrogate_pair_to_character(high, low):
 	'''Transform a pair of surrogate codepoints to one codepoint
 	'''
 	return 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
+
+
+_strwidth_documentation = (
+	'''Compute string width in display cells
+
+	{0}
+
+	:param dict width_data:
+		Dictionary which maps east_asian_width property values to strings 
+		lengths. It is expected to contain the following keys and values (from 
+		`East Asian Width annex <http://www.unicode.org/reports/tr11/>`_):
+
+		===  ======  ===========================================================
+		Key  Value   Description
+		===  ======  ===========================================================
+		F    2       Fullwidth: all characters that are defined as Fullwidth in 
+		             the Unicode Standard [Unicode] by having a compatibility 
+		             decomposition of type <wide> to characters elsewhere in the 
+		             Unicode Standard that are implicitly narrow but unmarked.
+		H    1       Halfwidth: all characters that are explicitly defined as 
+		             Halfwidth in the Unicode Standard by having a compatibility 
+		             decomposition of type <narrow> to characters elsewhere in 
+		             the Unicode Standard that are implicitly wide but unmarked, 
+		             plus U+20A9 ₩ WON SIGN.
+		W    2       Wide: all other characters that are always wide. These 
+		             characters occur only in the context of East Asian 
+		             typography where they are wide characters (such as the 
+		             Unified Han Ideographs or Squared Katakana Symbols). This 
+		             category includes characters that have explicit halfwidth 
+		             counterparts.
+		Na   1       Narrow: characters that are always narrow and have explicit 
+		             fullwidth or wide counterparts. These characters are 
+		             implicitly narrow in East Asian typography and legacy 
+		             character sets because they have explicit fullwidth or wide 
+		             counterparts. All of ASCII is an example of East Asian 
+		             Narrow characters.
+		A    1 or 2  Ambiguous: characters that may sometimes be wide and 
+		             sometimes narrow. Ambiguous characters require additional 
+		             information not contained in the character code to further 
+		             resolve their width. This information is usually defined in 
+		             terminal setting that should in turn respect glyphs widths 
+		             in used fonts. Also see :ref:`ambiwidth configuration 
+		             option <config-common-ambiwidth>`.
+		N    1       Neutral characters: character that does not occur in legacy 
+		             East Asian character sets.
+		===  ======  ===========================================================
+
+	:param unicode string:
+		String whose width will be calculated.
+
+	:return: unsigned integer.''')
+
+
+def strwidth_ucs_4(width_data, string):
+	return sum(((
+		(
+			0
+		) if combining(symbol) else (
+			width_data[east_asian_width(symbol)]
+		)
+	) for symbol in string))
+
+
+strwidth_ucs_4.__doc__ = _strwidth_documentation.format(
+	'''This version of the function expects that characters above 0xFFFF are 
+	represented using one symbol. This is only the case in UCS-4 Python builds.
+
+	.. note::
+		Even in UCS-4 Python builds it is possible to represent characters above 
+		0xFFFF using surrogate pairs. Characters represented this way are not 
+		supported.''')
+
+
+def strwidth_ucs_2(width_data, string):
+	return sum(((
+		(
+			width_data[
+				east_asian_width(
+					unichr(surrogate_pair_to_character(ord(string[i - 1]), ord(symbol)))
+				)
+			]
+		) if 0xDC00 <= ord(symbol) <= 0xDFFF else (
+			0
+		) if combining(symbol) or 0xD800 <= ord(symbol) <= 0xDBFF else (
+			width_data[east_asian_width(symbol)]
+		)
+	) for i, symbol in enumerate(string)))
+
+
+strwidth_ucs_2.__doc__ = _strwidth_documentation.format(
+	'''This version of the function expects that characters above 0xFFFF are 
+	represented using two symbols forming a surrogate pair, which is the only 
+	option in UCS-2 Python builds. It still works correctly in UCS-4 Python 
+	builds, but is slower than its UCS-4 counterpart.''')
--- a/powerline/renderer.py
+++ b/powerline/renderer.py
@ -5,11 +5,10 @@ import sys
 import os
 import re

-from unicodedata import east_asian_width, combining
 from itertools import chain

 from powerline.theme import Theme
-from powerline.lib.unicode import unichr, surrogate_pair_to_character
+from powerline.lib.unicode import unichr, strwidth_ucs_2, strwidth_ucs_4


 NBSP = ' '
@ -177,7 +176,10 @@ class Renderer(object):
 			'F': 2,          # Fullwidth
 		}

-	def strwidth(self, string):
+	strwidth = lambda self, s: (
+		(strwidth_ucs_2 if sys.maxunicode < 0x10FFFF else strwidth_ucs_4)(
+			self.width_data, s)
+	)
 	'''Function that returns string width.

 	Is used to calculate the place given string occupies when handling 
@ -189,34 +191,6 @@ class Renderer(object):

 	:return: unsigned integer.
 	'''
-		return sum(((
-			(
-				0
-			) if combining(symbol) else (
-				self.width_data[east_asian_width(symbol)]
-			)
-		) for symbol in string))
-
-	if sys.maxunicode < 0x10FFFF:
-		old_strwidth = strwidth
-
-		def strwidth(self, string):
-			return sum(((
-				(
-					self.width_data[
-						east_asian_width(
-							unichr(surrogate_pair_to_character(ord(string[i - 1]), ord(symbol)))
-						)
-					]
-				) if 0xDC00 <= ord(symbol) <= 0xDFFF else (
-					0
-				) if combining(symbol) or 0xD800 <= ord(symbol) <= 0xDBFF else (
-					self.width_data[east_asian_width(symbol)]
-				)
-			) for i, symbol in enumerate(string)))
-
-		strwidth.__doc__ = old_strwidth.__doc__
-		del old_strwidth

 	def get_theme(self, matcher_info):
 		'''Get Theme object.