From 19a897cf653e0f70d1ceb5396c8870ac1eab4ac1 Mon Sep 17 00:00:00 2001 From: mkamonMdt Date: Wed, 10 Feb 2021 10:44:25 +0100 Subject: [PATCH] [EGD-4604] Fix incorrect message UCS2 decode The problem could be noticed in an incoming message that contained emojis. All emojis were pushed to the front of a message, due to separate decoding of `uint32_t` chars 'larger' than `0xffff` --- enabled_unittests | 2 ++ module-utils/test/unittest_ucs2.cpp | 23 ++++++++++++++++ module-utils/ucs2/UCS2.cpp | 42 +++++++++++++---------------- 3 files changed, 44 insertions(+), 23 deletions(-) diff --git a/enabled_unittests b/enabled_unittests index 997008e8edf1192218f0c901f74e658dad966c25..743cddeb64668713a528bd4211299aa4ef9c056d 100644 --- a/enabled_unittests +++ b/enabled_unittests @@ -288,12 +288,14 @@ TESTS_LIST["catch2-utils-ucs2"]=" UCS2 to UTF8 conversion; UCS2 from UTF8 emoji 😁; UCS2 from UTF8 emoji 🍣; + UCS2 text with emojis int the middle from UTF8 code; UTF8 to UCS2 conversion; TEST special input characters from UTF8; TEST special input characters from std::string; UTF8 emoji 🍣 from UCS2 code; UTF8 emoji 😁 and text ęą from UCS2 code; UTF8 emoji 😁 and text abc from UCS2 code; + UTF8 text with emojis int the middle from UCS2 code; UCS2 to UTF8 long string conversion; UTF8 to UCS2 long string conversion; " diff --git a/module-utils/test/unittest_ucs2.cpp b/module-utils/test/unittest_ucs2.cpp index ff094c8530964a73af00e84752061641851dbbb5..b4213803a63868988e370a5eb851440f014f83c1 100644 --- a/module-utils/test/unittest_ucs2.cpp +++ b/module-utils/test/unittest_ucs2.cpp @@ -47,6 +47,18 @@ TEST_CASE("UCS2 from UTF8 emoji 🍣") REQUIRE(ucs2.str() == str); } +TEST_CASE("UCS2 text with emojis int the middle from UTF8 code") +{ + UTF8 utf8("ęą😁ęą🍣ęą"); + UCS2 ucs2 = UCS2(utf8); + std::string expected("01190105" //ęą + "D83DDE01" // 😁 + "01190105" // ęą + "D83CDF63" // 🍣 + "01190105"); + REQUIRE(ucs2.str() == expected); +} + TEST_CASE("UTF8 to UCS2 conversion") { UTF8 utf8("Test"); @@ 
-91,6 +103,17 @@ TEST_CASE("UTF8 emoji 😁 and text abc from UCS2 code") REQUIRE(ucs2.toUTF8() == utf8); } +TEST_CASE("UTF8 text with emojis int the middle from UCS2 code") +{ + UCS2 ucs2(std::string("01190105" //ęą + "D83DDE01" // 😁 + "01190105" // ęą + "D83CDF63" // 🍣 + "01190105")); + UTF8 utf8("ęą😁ęą🍣ęą"); + REQUIRE(ucs2.toUTF8() == utf8); +} + TEST_CASE("UCS2 to UTF8 long string conversion") { const char *messageRawBody = "004C006F00720065006D00200069007000730075006D00200064006F006C006F007200200073006900740" diff --git a/module-utils/ucs2/UCS2.cpp b/module-utils/ucs2/UCS2.cpp index 6f52b32f1acbb1c509fb2d39a524e0cc11f87181..465d0dbc635e312a918f42ab681554f1963f1368 100644 --- a/module-utils/ucs2/UCS2.cpp +++ b/module-utils/ucs2/UCS2.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, Mudita Sp. z.o.o. All rights reserved. +// Copyright (c) 2017-2021, Mudita Sp. z.o.o. All rights reserved. // For licensing, see https://github.com/mudita/MuditaOS/LICENSE.md #include "UCS2.hpp" @@ -59,20 +59,17 @@ UCS2::UCS2(UCS2 &ucs) UTF8 UCS2::toUTF8() const noexcept { - if (length == 0) + if (length == 0) { return UTF8(); + } - // create buffer for worst case scenario which is that every char will take 3 bytes in utf8 string - // + 1 for null terminator - - const auto bufferSize = 3 * length + 1; - auto buffer = std::make_unique(bufferSize); - - uint32_t offset = 0; - std::string s{}; + constexpr auto maxDecodedCharSize = 3; + std::array localBuffer; + std::string decodeResult; + decodeResult.reserve(maxDecodedCharSize * length + 1); for (uint32_t i = 0; i < length; i++) { - uint32_t c = this->buffer[i]; - + uint32_t offset = 0; + const uint32_t c = this->buffer[i]; if (c > 0xffff) { // 32 bit conversion // U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000 @@ -81,25 +78,24 @@ UTF8 UCS2::toUTF8() const noexcept const uint16_t y = (c & 0x03FF0000) >> 16; const uint16_t x = c & 0x03FF; const uint32_t decoded = 0x10000 + (y << 10) + x; - std::u32string u32s = {decoded}; - - 
s.append(convertToUtf8String(u32s)); + decodeResult.append(convertToUtf8String({decoded})); + continue; } else if (c > 0x07ff) { - buffer[offset++] = (0x00E0 | ((c & 0xF000) >> 12)); - buffer[offset++] = (0x0080 | ((c & 0x0FC0) >> 6)); - buffer[offset++] = (0x0080 | (c & 0x003F)); + localBuffer[offset++] = (0x00E0 | ((c & 0xF000) >> 12)); + localBuffer[offset++] = (0x0080 | ((c & 0x0FC0) >> 6)); + localBuffer[offset++] = (0x0080 | (c & 0x003F)); } else if (c > 0x07f) { - buffer[offset++] = (0x00C0 | ((c & 0x07C0) >> 6)); - buffer[offset++] = (0x0080 | (c & 0x003F)); + localBuffer[offset++] = (0x00C0 | ((c & 0x07C0) >> 6)); + localBuffer[offset++] = (0x0080 | (c & 0x003F)); } else { - buffer[offset++] = c; + localBuffer[offset++] = c; } + decodeResult.append(std::begin(localBuffer), std::next(std::begin(localBuffer), offset)); } - return (!s.empty()) ? UTF8(s + reinterpret_cast(buffer.get())) - : UTF8(reinterpret_cast(buffer.get())); + return UTF8(decodeResult.c_str()); } void UCS2::append(const uint32_t &ucs2char)