audk/BaseTools/Tests/CheckUnicodeSourceFiles.py

## @file
#  Unit tests for AutoGen.UniClassObject
#
#  Copyright (c) 2015, Intel Corporation. All rights reserved.<BR>
#
#  This program and the accompanying materials
#  are licensed and made available under the terms and conditions of the BSD License
#  which accompanies this distribution.  The full text of the license may be found at
#  http://opensource.org/licenses/bsd-license.php
#
#  THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
#  WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
#

##
# Import Modules
#
import os
import unittest

import codecs

import TestTools

from Common.Misc import PathClass
import AutoGen.UniClassObject as BtUni

from Common import EdkLogger
EdkLogger.InitializeForUnitTest()

class Tests(TestTools.BaseToolsTest):
    """Encoding acceptance tests for AutoGen.UniClassObject.

    Every test writes a small .uni file through self.WriteTmpFile and then
    constructs BtUni.UniFileClassObject on it.  Depending on the test, the
    construction is expected either to succeed or to raise UnicodeError or
    EdkLogger.FatalError.
    """

    # Minimal .uni content used whenever a test does not supply its own data.
    SampleData = u'''
        #langdef en-US "English"
        #string STR_A #language en-US "STR_A for en-US"
    '''

    def EncodeToFile(self, encoding, string=None):
        """Write test data to 'input.uni' and return its path object.

        encoding -- codec name handed to codecs.encode(), or None to write
                    `string` exactly as given (used for hand-built byte data).
        string   -- data to write; defaults to self.SampleData.

        Returns a PathClass built from self.GetTmpFilePath('input.uni').
        """
        if string is None:
            string = self.SampleData
        if encoding is not None:
            data = codecs.encode(string, encoding)
        else:
            data = string
        path = 'input.uni'
        self.WriteTmpFile(path, data)
        return PathClass(self.GetTmpFilePath(path))

    def ErrorFailure(self, error, encoding, shouldPass):
        """Fail the current test with a message about the exception `error`.

        The message says `error` "should not be generated" when the file was
        expected to pass, and "should be generated" otherwise.
        """
        msg = error + ' should '
        if shouldPass:
            msg += 'not '
        msg += 'be generated for '
        msg += '%s data in a .uni file' % encoding
        self.fail(msg)

    def UnicodeErrorFailure(self, encoding, shouldPass):
        """Fail the current test, naming UnicodeError in the message."""
        self.ErrorFailure('UnicodeError', encoding, shouldPass)

    def EdkErrorFailure(self, encoding, shouldPass):
        """Fail the current test, naming EdkLogger.FatalError in the message."""
        self.ErrorFailure('EdkLogger.FatalError', encoding, shouldPass)

    def CheckFile(self, encoding, shouldPass, string=None):
        """Write a .uni file and check whether BaseTools accepts it.

        encoding   -- forwarded to EncodeToFile().
        shouldPass -- True if BtUni.UniFileClassObject must load the file
                      without raising; False if it must raise UnicodeError
                      or EdkLogger.FatalError.
        string     -- forwarded to EncodeToFile().

        The test fails (via self.fail) when the outcome does not match
        `shouldPass`, or when any other exception type is raised.
        """
        path = self.EncodeToFile(encoding, string)
        try:
            BtUni.UniFileClassObject([path])
        except UnicodeError:
            if shouldPass:
                self.UnicodeErrorFailure(encoding, shouldPass)
            return
        except EdkLogger.FatalError:
            if shouldPass:
                self.EdkErrorFailure(encoding, shouldPass)
            return
        except Exception as e:
            # Any other exception is a failure regardless of shouldPass.
            # Report what was actually raised rather than a generic
            # EdkLogger.FatalError message, so the cause is visible.
            self.fail('Unexpected %s (%s) raised for %s data in a .uni file'
                      % (type(e).__name__, e, encoding))

        if not shouldPass:
            # The file loaded cleanly although it was supposed to be rejected.
            self.EdkErrorFailure(encoding, shouldPass)

    def testUtf16InUniFile(self):
        self.CheckFile('utf_16', shouldPass=True)

    def testSupplementaryPlaneUnicodeCharInUtf16File(self):
        #
        # Supplementary Plane characters can exist in UTF-16 files,
        # but they are not valid UCS-2 characters.
        #
        # This test makes sure that BaseTools rejects these characters
        # if seen in a .uni file.
        #
        data = u'''
            #langdef en-US "English"
            #string STR_A #language en-US "CodePoint (\U00010300) > 0xFFFF"
        '''

        self.CheckFile('utf_16', shouldPass=False, string=data)

    def testSurrogatePairUnicodeCharInUtf16File(self):
        #
        # Surrogate code points are used in pairs in UTF-16 files to
        # encode the Supplementary Plane characters. A surrogate that
        # is not followed by a second surrogate might be interpreted
        # as a single character whose code point is the surrogate
        # value itself.
        #
        # This test makes sure that BaseTools rejects these characters
        # if seen in a .uni file.
        #
        # A bytes literal is used so that the concatenation with the
        # codecs BOM constant is also valid on Python 3; on Python 2 the
        # value is unchanged.
        #
        data = codecs.BOM_UTF16_LE + b'//\x01\xd8 '

        self.CheckFile(encoding=None, shouldPass=False, string=data)

    def testValidUtf8File(self):
        self.CheckFile(encoding='utf_8', shouldPass=True)

    def testValidUtf8FileWithBom(self):
        #
        # Same test as testValidUtf8File, but add the UTF-8 BOM
        #
        data = codecs.BOM_UTF8 + codecs.encode(self.SampleData, 'utf_8')

        self.CheckFile(encoding=None, shouldPass=True, string=data)

    def test32bitUnicodeCharInUtf16File(self):
        #
        # UTF-16 variant of test32bitUnicodeCharInUtf8File. It used to
        # carry the same name as the UTF-8 test below, so the later
        # definition replaced it and it never ran.
        #
        data = u'''
            #langdef en-US "English"
            #string STR_A #language en-US "CodePoint (\U00010300) > 0xFFFF"
        '''

        self.CheckFile('utf_16', shouldPass=False, string=data)

    def test32bitUnicodeCharInUtf8File(self):
        #
        # A code point above 0xFFFF inside a string must be rejected in a
        # UTF-8 encoded .uni file.
        #
        data = u'''
            #langdef en-US "English"
            #string STR_A #language en-US "CodePoint (\U00010300) > 0xFFFF"
        '''

        self.CheckFile('utf_8', shouldPass=False, string=data)

    def test32bitUnicodeCharInUtf8Comment(self):
        #
        # Same as test32bitUnicodeCharInUtf8File, but the offending code
        # point appears inside a // comment instead of a string.
        #
        data = u'''
            // Even in comments, we reject non-UCS-2 chars: \U00010300
            #langdef en-US "English"
            #string STR_A #language en-US "A"
        '''

        self.CheckFile('utf_8', shouldPass=False, string=data)

    def testSurrogatePairUnicodeCharInUtf8File(self):
        #
        # Surrogate Pair code points are used in UTF-16 files to
        # encode the Supplementary Plane characters. In UTF-8, it is
        # trivial to encode these code points, but they are not valid
        # code points for characters, since they are reserved for the
        # UTF-16 Surrogate Pairs.
        #
        # This test makes sure that BaseTools rejects these characters
        # if seen in a .uni file.
        #
        # Bytes literal: identical on Python 2, and raw bytes (not text)
        # on Python 3.
        #
        data = b'\xed\xa0\x81'

        self.CheckFile(encoding=None, shouldPass=False, string=data)

    def testSurrogatePairUnicodeCharInUtf8FileWithBom(self):
        #
        # Same test as testSurrogatePairUnicodeCharInUtf8File, but add
        # the UTF-8 BOM
        #
        data = codecs.BOM_UTF8 + b'\xed\xa0\x81'

        self.CheckFile(encoding=None, shouldPass=False, string=data)

# Suite factory built from this module's namespace; it is called below to
# obtain the tests to run.
TheTestSuite = TestTools.MakeTheTestSuite(locals())

# When executed directly, run the tests of this module with the plain text
# runner.
if __name__ == '__main__':
    suite = TheTestSuite()
    runner = unittest.TextTestRunner()
    runner.run(suite)
BaseTools/Tests: Add unit test for AutoGen.UniClassObject This verifies that a UTF-16 data (with BOM) .uni file is successfully read. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17693 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:14 +02:00			`## @file`
			`# Unit tests for AutoGen.UniClassObject`
			`#`
			`# Copyright (c) 2015, Intel Corporation. All rights reserved.<BR>`
			`#`
			`# This program and the accompanying materials`
			`# are licensed and made available under the terms and conditions of the BSD License`
			`# which accompanies this distribution. The full text of the license may be found at`
			`# http://opensource.org/licenses/bsd-license.php`
			`#`
			`# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.`
			`#`

			`##`
			`# Import Modules`
			`#`
			`import os`
			`import unittest`

			`import codecs`

			`import TestTools`

			`from Common.Misc import PathClass`
			`import AutoGen.UniClassObject as BtUni`

			`from Common import EdkLogger`
			`EdkLogger.InitializeForUnitTest()`

			`class Tests(TestTools.BaseToolsTest):`

			`SampleData = u'''`
			`#langdef en-US "English"`
			`#string STR_A #language en-US "STR_A for en-US"`
			`'''`

			`def EncodeToFile(self, encoding, string=None):`
			`if string is None:`
			`string = self.SampleData`
BaseTools/Tests: Verify unsupported UTF-16 are rejected Supplementary Plane characters can exist in UTF-16 files, but they are not valid UCS-2 characters. For example, this python interpreter code: >>> import codecs >>> codecs.encode(u'\U00010300', 'utf-16') '\xff\xfe\x00\xd8\x00\xdf' Therefore the UCS-4 0x00010300 character is encoded as two 16-bit numbers (0xd800 0xdf00) in a little endian UTF-16 file. For more information, see: http://en.wikipedia.org/wiki/UTF-16#U.2B10000_to_U.2B10FFFF This test checks to make sure that BaseTools will reject these characters in UTF-16 files. The range of 0xd800 - 0xdfff should also be rejected as unicode code points because they are reserved for the surrogate pair usage in UTF-16 files. This test was fixed by the previous commit: "BaseTools/UniClassObject: Verify valid UCS-2 chars in UTF-16 .uni files" Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17695 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:22 +02:00			`if encoding is not None:`
			`data = codecs.encode(string, encoding)`
			`else:`
			`data = string`
BaseTools/Tests: Add unit test for AutoGen.UniClassObject This verifies that a UTF-16 data (with BOM) .uni file is successfully read. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17693 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:14 +02:00			`path = 'input.uni'`
			`self.WriteTmpFile(path, data)`
			`return PathClass(self.GetTmpFilePath(path))`

			`def ErrorFailure(self, error, encoding, shouldPass):`
			`msg = error + ' should '`
			`if shouldPass:`
			`msg += 'not '`
			`msg += 'be generated for '`
			`msg += '%s data in a .uni file' % encoding`
			`self.fail(msg)`

			`def UnicodeErrorFailure(self, encoding, shouldPass):`
			`self.ErrorFailure('UnicodeError', encoding, shouldPass)`

			`def EdkErrorFailure(self, encoding, shouldPass):`
			`self.ErrorFailure('EdkLogger.FatalError', encoding, shouldPass)`

			`def CheckFile(self, encoding, shouldPass, string=None):`
			`path = self.EncodeToFile(encoding, string)`
			`try:`
			`BtUni.UniFileClassObject([path])`
			`if shouldPass:`
			`return`
			`except UnicodeError:`
			`if not shouldPass:`
			`return`
			`else:`
			`self.UnicodeErrorFailure(encoding, shouldPass)`
			`except EdkLogger.FatalError:`
			`if not shouldPass:`
			`return`
			`else:`
			`self.EdkErrorFailure(encoding, shouldPass)`
			`except Exception:`
			`pass`

			`self.EdkErrorFailure(encoding, shouldPass)`

			`def testUtf16InUniFile(self):`
			`self.CheckFile('utf_16', shouldPass=True)`

BaseTools/Tests: Verify unsupported UTF-16 are rejected Supplementary Plane characters can exist in UTF-16 files, but they are not valid UCS-2 characters. For example, this python interpreter code: >>> import codecs >>> codecs.encode(u'\U00010300', 'utf-16') '\xff\xfe\x00\xd8\x00\xdf' Therefore the UCS-4 0x00010300 character is encoded as two 16-bit numbers (0xd800 0xdf00) in a little endian UTF-16 file. For more information, see: http://en.wikipedia.org/wiki/UTF-16#U.2B10000_to_U.2B10FFFF This test checks to make sure that BaseTools will reject these characters in UTF-16 files. The range of 0xd800 - 0xdfff should also be rejected as unicode code points because they are reserved for the surrogate pair usage in UTF-16 files. This test was fixed by the previous commit: "BaseTools/UniClassObject: Verify valid UCS-2 chars in UTF-16 .uni files" Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17695 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:22 +02:00			`def testSupplementaryPlaneUnicodeCharInUtf16File(self):`
			`#`
			`# Supplementary Plane characters can exist in UTF-16 files,`
			`# but they are not valid UCS-2 characters.`
			`#`
			`# This test makes sure that BaseTools rejects these characters`
			`# if seen in a .uni file.`
			`#`
			`data = u'''`
			`#langdef en-US "English"`
			`#string STR_A #language en-US "CodePoint (\U00010300) > 0xFFFF"`
			`'''`

			`self.CheckFile('utf_16', shouldPass=False, string=data)`

			`def testSurrogatePairUnicodeCharInUtf16File(self):`
			`#`
			`# Surrogate Pair code points are used in UTF-16 files to`
			`# encode the Supplementary Plane characters. But, a Surrogate`
			`# Pair code point which is not followed by another Surrogate`
			`# Pair code point might be interpreted as a single code point`
			`# with the Surrogate Pair code point.`
			`#`
			`# This test makes sure that BaseTools rejects these characters`
			`# if seen in a .uni file.`
			`#`
Revert BaseTools: PYTHON3 migration This reverts commit 6693f359b3c213513c5096a06c6f67244a44dc52.. 678f85131238622e576705117e299d81cff755c9. Python3 migration is the fundamental change. It requires every developer to install Python3. Before this migration, the well communication and wide verification must be done. But now, most people is not aware of this change, and not try it. So, Python3 migration is reverted and be moved to edk2-staging Python3 branch for the edk2 user evaluation. Contributed-under: TianoCore Contribution Agreement 1.1 Signed-off-by: Liming Gao <liming.gao@intel.com> 2018-10-15 02:27:53 +02:00			`data = codecs.BOM_UTF16_LE + '//\x01\xd8 '`
BaseTools/Tests: Verify unsupported UTF-16 are rejected Supplementary Plane characters can exist in UTF-16 files, but they are not valid UCS-2 characters. For example, this python interpreter code: >>> import codecs >>> codecs.encode(u'\U00010300', 'utf-16') '\xff\xfe\x00\xd8\x00\xdf' Therefore the UCS-4 0x00010300 character is encoded as two 16-bit numbers (0xd800 0xdf00) in a little endian UTF-16 file. For more information, see: http://en.wikipedia.org/wiki/UTF-16#U.2B10000_to_U.2B10FFFF This test checks to make sure that BaseTools will reject these characters in UTF-16 files. The range of 0xd800 - 0xdfff should also be rejected as unicode code points because they are reserved for the surrogate pair usage in UTF-16 files. This test was fixed by the previous commit: "BaseTools/UniClassObject: Verify valid UCS-2 chars in UTF-16 .uni files" Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17695 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:22 +02:00
			`self.CheckFile(encoding=None, shouldPass=False, string=data)`

BaseTools/Tests: Verify supported UTF-8 data is allowed We test a simple case of UTF-8 with and without the UTF-8 BOM. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17699 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:43 +02:00			`def testValidUtf8File(self):`
			`self.CheckFile(encoding='utf_8', shouldPass=True)`

			`def testValidUtf8FileWithBom(self):`
			`#`
			`# Same test as testValidUtf8File, but add the UTF-8 BOM`
			`#`
			`data = codecs.BOM_UTF8 + codecs.encode(self.SampleData, 'utf_8')`

			`self.CheckFile(encoding=None, shouldPass=True, string=data)`

BaseTools/Tests: Verify 32-bit UTF-8 chars are rejected Since UTF-8 .uni unicode files might contain strings with unicode code points larger than 16-bits, and UEFI only supports UCS-2 characters, we need to make sure that BaseTools rejects these characters in UTF-8 .uni source files. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17697 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:33 +02:00			`def test32bitUnicodeCharInUtf8File(self):`
			`data = u'''`
			`#langdef en-US "English"`
			`#string STR_A #language en-US "CodePoint (\U00010300) > 0xFFFF"`
			`'''`

			`self.CheckFile('utf_16', shouldPass=False, string=data)`

			`def test32bitUnicodeCharInUtf8File(self):`
			`data = u'''`
			`#langdef en-US "English"`
			`#string STR_A #language en-US "CodePoint (\U00010300) > 0xFFFF"`
			`'''`

			`self.CheckFile('utf_8', shouldPass=False, string=data)`

			`def test32bitUnicodeCharInUtf8Comment(self):`
			`data = u'''`
			`// Even in comments, we reject non-UCS-2 chars: \U00010300`
			`#langdef en-US "English"`
			`#string STR_A #language en-US "A"`
			`'''`

			`self.CheckFile('utf_8', shouldPass=False, string=data)`

BaseTools/Tests: Verify unsupported UTF-8 data is rejected Surrogate pair characters can be encoded in UTF-8 files, but they are not valid UCS-2 characters. For example, this python interpreter code: >>> import codecs >>> codecs.encode(u'\ud801', 'utf-8') '\xed\xa0\x81' But, the range of 0xd800 - 0xdfff should be rejected as unicode code points because they are reserved for the surrogate pair usage in UTF-16 files. We test that this case is rejected for UTF-8 with and without the UTF-8 BOM. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17698 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:38 +02:00			`def testSurrogatePairUnicodeCharInUtf8File(self):`
			`#`
			`# Surrogate Pair code points are used in UTF-16 files to`
			`# encode the Supplementary Plane characters. In UTF-8, it is`
			`# trivial to encode these code points, but they are not valid`
			`# code points for characters, since they are reserved for the`
			`# UTF-16 Surrogate Pairs.`
			`#`
			`# This test makes sure that BaseTools rejects these characters`
			`# if seen in a .uni file.`
			`#`
Revert BaseTools: PYTHON3 migration This reverts commit 6693f359b3c213513c5096a06c6f67244a44dc52.. 678f85131238622e576705117e299d81cff755c9. Python3 migration is the fundamental change. It requires every developer to install Python3. Before this migration, the well communication and wide verification must be done. But now, most people is not aware of this change, and not try it. So, Python3 migration is reverted and be moved to edk2-staging Python3 branch for the edk2 user evaluation. Contributed-under: TianoCore Contribution Agreement 1.1 Signed-off-by: Liming Gao <liming.gao@intel.com> 2018-10-15 02:27:53 +02:00			`data = '\xed\xa0\x81'`
BaseTools/Tests: Verify unsupported UTF-8 data is rejected Surrogate pair characters can be encoded in UTF-8 files, but they are not valid UCS-2 characters. For example, this python interpreter code: >>> import codecs >>> codecs.encode(u'\ud801', 'utf-8') '\xed\xa0\x81' But, the range of 0xd800 - 0xdfff should be rejected as unicode code points because they are reserved for the surrogate pair usage in UTF-16 files. We test that this case is rejected for UTF-8 with and without the UTF-8 BOM. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17698 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:38 +02:00
			`self.CheckFile(encoding=None, shouldPass=False, string=data)`

			`def testSurrogatePairUnicodeCharInUtf8FileWithBom(self):`
			`#`
			`# Same test as testSurrogatePairUnicodeCharInUtf8File, but add`
			`# the UTF-8 BOM`
			`#`
Revert BaseTools: PYTHON3 migration This reverts commit 6693f359b3c213513c5096a06c6f67244a44dc52.. 678f85131238622e576705117e299d81cff755c9. Python3 migration is the fundamental change. It requires every developer to install Python3. Before this migration, the well communication and wide verification must be done. But now, most people is not aware of this change, and not try it. So, Python3 migration is reverted and be moved to edk2-staging Python3 branch for the edk2 user evaluation. Contributed-under: TianoCore Contribution Agreement 1.1 Signed-off-by: Liming Gao <liming.gao@intel.com> 2018-10-15 02:27:53 +02:00			`data = codecs.BOM_UTF8 + '\xed\xa0\x81'`
BaseTools/Tests: Verify unsupported UTF-8 data is rejected Surrogate pair characters can be encoded in UTF-8 files, but they are not valid UCS-2 characters. For example, this python interpreter code: >>> import codecs >>> codecs.encode(u'\ud801', 'utf-8') '\xed\xa0\x81' But, the range of 0xd800 - 0xdfff should be rejected as unicode code points because they are reserved for the surrogate pair usage in UTF-16 files. We test that this case is rejected for UTF-8 with and without the UTF-8 BOM. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17698 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:38 +02:00
			`self.CheckFile(encoding=None, shouldPass=False, string=data)`

BaseTools/Tests: Add unit test for AutoGen.UniClassObject This verifies that a UTF-16 data (with BOM) .uni file is successfully read. Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com> Reviewed-by: Yingke Liu <yingke.d.liu@intel.com> git-svn-id: https://svn.code.sf.net/p/edk2/code/trunk/edk2@17693 6f19259b-4bc3-4df7-8a09-765794883524 2015-06-24 01:34:14 +02:00			`TheTestSuite = TestTools.MakeTheTestSuite(locals())`

			`if __name__ == '__main__':`
			`allTests = TheTestSuite()`
			`unittest.TextTestRunner().run(allTests)`