From 9576738bfaecb35ff9e6aa933e7efd4bbf3b0a98 Mon Sep 17 00:00:00 2001
From: ZyX
Date: Thu, 4 Dec 2014 22:34:00 +0300
Subject: [PATCH] When parsing JSON join surrogate pairs

Also closes #1211
---
Review notes (ignored by `git am`):
- The lead-surrogate test is `0xD800 <= code <= 0xDBFF`. U+DC00 is a trail
  (low) surrogate; accepting it as a lead would make
  surrogate_pair_to_character() return a value of 0x110000 or more, which is
  above the last Unicode code point U+10FFFF.
- NOTE(review): leading tabs in the hunks below were reconstructed from a
  whitespace-collapsed copy of this patch; confirm them against the target
  tree before applying.

 powerline/lib/unicode.py             |  6 ++++++
 powerline/lint/markedjson/scanner.py | 29 +++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/powerline/lib/unicode.py b/powerline/lib/unicode.py
index 32ea3afe..ae8bf8f2 100644
--- a/powerline/lib/unicode.py
+++ b/powerline/lib/unicode.py
@@ -128,3 +128,9 @@ def string(s):
 		return s.encode('utf-8')
 	else:
 		return s
+
+
+def surrogate_pair_to_character(high, low):
+	'''Transform a pair of surrogate codepoints to one codepoint
+	'''
+	return 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
diff --git a/powerline/lint/markedjson/scanner.py b/powerline/lint/markedjson/scanner.py
index 543d7298..b0bddf38 100644
--- a/powerline/lint/markedjson/scanner.py
+++ b/powerline/lint/markedjson/scanner.py
@@ -1,9 +1,14 @@
 # vim:fileencoding=utf-8:noet
 from __future__ import (unicode_literals, division, absolute_import, print_function)
 
+from string import hexdigits
+
 from powerline.lint.markedjson.error import MarkedError
 from powerline.lint.markedjson import tokens
-from powerline.lib.unicode import unicode
+from powerline.lib.unicode import unicode, unichr, surrogate_pair_to_character
+
+
+hexdigits_set = set(hexdigits)
 
 
 # Scanner produces tokens of the following types:
@@ -415,7 +420,7 @@ class Scanner:
 					length = self.ESCAPE_CODES[ch]
 					self.forward()
 					for k in range(length):
-						if self.peek(k) not in '0123456789ABCDEFabcdef':
+						if self.peek(k) not in hexdigits:
 							raise ScannerError(
 								'while scanning a double-quoted scalar', start_mark,
 								'expected escape sequence of %d hexdecimal numbers, but found %r' % (
@@ -423,8 +428,26 @@ class Scanner:
 								self.get_mark()
 							)
 					code = int(self.prefix(length), 16)
-					chunks.append(chr(code))
 					self.forward(length)
+					if 0xD800 <= code <= 0xDBFF:
+						# Lead (high) surrogate: a \uDC00..\uDFFF escape must follow
+						next_char = self.prefix(6)
+						if (
+							next_char[0] != '\\'
+							or next_char[1] != 'u'
+							or not (set(next_char[2:]) < hexdigits_set)
+							or not (0xDC00 <= int(next_char[2:], 16) <= 0xDFFF)
+						):
+							raise ScannerError(
+								'while scanning a double-quoted scalar', start_mark,
+								'expected escape sequence with the next character in surrogate pair, but found %r' % (
+									next_char
+								),
+								self.get_mark()
+							)
+						code = surrogate_pair_to_character(code, int(next_char[2:], 16))
+						self.forward(6)
+					chunks.append(unichr(code))
 				else:
 					raise ScannerError(
 						'while scanning a double-quoted scalar', start_mark,