From 9576738bfaecb35ff9e6aa933e7efd4bbf3b0a98 Mon Sep 17 00:00:00 2001
From: ZyX <kp-pav@yandex.ru>
Date: Thu, 4 Dec 2014 22:34:00 +0300
Subject: [PATCH] When parsing JSON join surrogate pairs

Also closes #1211
---
 powerline/lib/unicode.py             |  6 ++++++
 powerline/lint/markedjson/scanner.py | 29 +++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/powerline/lib/unicode.py b/powerline/lib/unicode.py
index 32ea3afe..ae8bf8f2 100644
--- a/powerline/lib/unicode.py
+++ b/powerline/lib/unicode.py
@@ -128,3 +128,9 @@ def string(s):
 		return s.encode('utf-8')
 	else:
 		return s
+
+
+def surrogate_pair_to_character(high, low):
+	'''Return the codepoint encoded by a UTF-16 surrogate pair: high is offset
+	from 0xD800, low from 0xDC00; neither argument is range-checked.'''
+	return ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
diff --git a/powerline/lint/markedjson/scanner.py b/powerline/lint/markedjson/scanner.py
index 543d7298..b0bddf38 100644
--- a/powerline/lint/markedjson/scanner.py
+++ b/powerline/lint/markedjson/scanner.py
@@ -1,9 +1,14 @@
 # vim:fileencoding=utf-8:noet
 from __future__ import (unicode_literals, division, absolute_import, print_function)
 
+from string import hexdigits
+
 from powerline.lint.markedjson.error import MarkedError
 from powerline.lint.markedjson import tokens
-from powerline.lib.unicode import unicode
+from powerline.lib.unicode import unicode, unichr, surrogate_pair_to_character
+
+
+hexdigits_set = set(hexdigits)
 
 
 # Scanner produces tokens of the following types:
@@ -415,7 +420,7 @@ class Scanner:
 					length = self.ESCAPE_CODES[ch]
 					self.forward()
 					for k in range(length):
-						if self.peek(k) not in '0123456789ABCDEFabcdef':
+						if self.peek(k) not in hexdigits:
 							raise ScannerError(
 								'while scanning a double-quoted scalar', start_mark,
 								'expected escape sequence of %d hexdecimal numbers, but found %r' % (
@@ -423,8 +428,26 @@ class Scanner:
 								self.get_mark()
 							)
 					code = int(self.prefix(length), 16)
-					chunks.append(chr(code))
 					self.forward(length)
+					if 0xD800 <= code <= 0xDBFF:
+						# High (leading) surrogate, 0xD800..0xDBFF: a \uDC00..\uDFFF escape must follow
+						next_char = self.prefix(6)
+						if (
+							next_char[0] != '\\'
+							or next_char[1] != 'u'
+							or not (set(next_char[2:]) < hexdigits_set)
+							or not (0xDC00 <= int(next_char[2:], 16) <= 0xDFFF)
+						):
+							raise ScannerError(
+								'while scanning a double-quoted scalar', start_mark,
+								'expected escape sequence with the next character in surrogate pair, but found %r' % (
+									next_char
+								),
+								self.get_mark()
+							)
+						code = surrogate_pair_to_character(code, int(next_char[2:], 16))
+						self.forward(6)
+					chunks.append(unichr(code))
 				else:
 					raise ScannerError(
 						'while scanning a double-quoted scalar', start_mark,