~aleteoryx/muditaos

19a897cf653e0f70d1ceb5396c8870ac1eab4ac1 — mkamonMdt 5 years ago d3e51fb
[EGD-4604] Fix incorrect message UCS2 decode

The problem could be noticed in an incoming message that contained
emojis. All emojis were pushed to the front of the message, because
`uint32_t` chars 'larger' than `0xffff` were decoded separately from
the remaining characters.
3 files changed, 44 insertions(+), 23 deletions(-)

M enabled_unittests
M module-utils/test/unittest_ucs2.cpp
M module-utils/ucs2/UCS2.cpp
M enabled_unittests => enabled_unittests +2 -0
@@ 288,12 288,14 @@ TESTS_LIST["catch2-utils-ucs2"]="
    UCS2 to UTF8 conversion;
    UCS2 from UTF8 emoji 😁;
    UCS2 from UTF8 emoji 🍣;
    UCS2 text with emojis int the middle from UTF8 code;
    UTF8 to UCS2 conversion;
    TEST special input characters from UTF8;
    TEST special input characters from std::string;
    UTF8 emoji 🍣 from UCS2 code;
    UTF8 emoji 😁 and text ęą from UCS2 code;
    UTF8 emoji 😁 and text abc from UCS2 code;
    UTF8 text with emojis int the middle from UCS2 code;
    UCS2 to UTF8 long string conversion;
    UTF8 to UCS2 long string conversion;
"

M module-utils/test/unittest_ucs2.cpp => module-utils/test/unittest_ucs2.cpp +23 -0
@@ 47,6 47,18 @@ TEST_CASE("UCS2 from UTF8 emoji 🍣")
    REQUIRE(ucs2.str() == str);
}

TEST_CASE("UCS2 text with emojis int the middle from UTF8 code")
{
    // Expected hex dump: plain text and emojis must keep their original order,
    // with each emoji encoded as a pair of 16-bit units.
    std::string expectedHex("01190105"   // ęą
                            "D83DDE01"   // 😁
                            "01190105"   // ęą
                            "D83CDF63"   // 🍣
                            "01190105"); // ęą

    UTF8 source("ęą😁ęą🍣ęą");
    UCS2 encoded(source);

    REQUIRE(encoded.str() == expectedHex);
}

TEST_CASE("UTF8 to UCS2 conversion")
{
    UTF8 utf8("Test");


@@ 91,6 103,17 @@ TEST_CASE("UTF8 emoji 😁 and text abc from UCS2 code")
    REQUIRE(ucs2.toUTF8() == utf8);
}

TEST_CASE("UTF8 text with emojis int the middle from UCS2 code")
{
    // Text the hex dump below is expected to decode to, emojis staying in place.
    UTF8 expectedText("ęą😁ęą🍣ęą");

    UCS2 encoded(std::string("01190105"    // ęą
                             "D83DDE01"    // 😁
                             "01190105"    // ęą
                             "D83CDF63"    // 🍣
                             "01190105")); // ęą

    REQUIRE(encoded.toUTF8() == expectedText);
}

TEST_CASE("UCS2 to UTF8 long string conversion")
{
    const char *messageRawBody = "004C006F00720065006D00200069007000730075006D00200064006F006C006F007200200073006900740"

M module-utils/ucs2/UCS2.cpp => module-utils/ucs2/UCS2.cpp +19 -23
@@ 1,4 1,4 @@
// Copyright (c) 2017-2020, Mudita Sp. z.o.o. All rights reserved.
// Copyright (c) 2017-2021, Mudita Sp. z.o.o. All rights reserved.
// For licensing, see https://github.com/mudita/MuditaOS/LICENSE.md

#include "UCS2.hpp"


@@ 59,20 59,17 @@ UCS2::UCS2(UCS2 &ucs)

UTF8 UCS2::toUTF8() const noexcept
{
    if (length == 0)
    if (length == 0) {
        return UTF8();
    }

    // create buffer for worst case scenario which is that every char will take 3 bytes in utf8 string
    // + 1 for null terminator

    const auto bufferSize = 3 * length + 1;
    auto buffer           = std::make_unique<uint8_t[]>(bufferSize);

    uint32_t offset = 0;
    std::string s{};
    constexpr auto maxDecodedCharSize = 3;
    std::array<uint8_t, maxDecodedCharSize> localBuffer;
    std::string decodeResult;
    decodeResult.reserve(maxDecodedCharSize * length + 1);
    for (uint32_t i = 0; i < length; i++) {
        uint32_t c = this->buffer[i];

        uint32_t offset  = 0;
        const uint32_t c = this->buffer[i];
        if (c > 0xffff) {
            // 32 bit conversion
            // U' = yyyyyyyyyyxxxxxxxxxx  // U - 0x10000


@@ 81,25 78,24 @@ UTF8 UCS2::toUTF8() const noexcept
            const uint16_t y       = (c & 0x03FF0000) >> 16;
            const uint16_t x       = c & 0x03FF;
            const uint32_t decoded = 0x10000 + (y << 10) + x;
            std::u32string u32s    = {decoded};

            s.append(convertToUtf8String(u32s));
            decodeResult.append(convertToUtf8String({decoded}));
            continue;
        }
        else if (c > 0x07ff) {
            buffer[offset++] = (0x00E0 | ((c & 0xF000) >> 12));
            buffer[offset++] = (0x0080 | ((c & 0x0FC0) >> 6));
            buffer[offset++] = (0x0080 | (c & 0x003F));
            localBuffer[offset++] = (0x00E0 | ((c & 0xF000) >> 12));
            localBuffer[offset++] = (0x0080 | ((c & 0x0FC0) >> 6));
            localBuffer[offset++] = (0x0080 | (c & 0x003F));
        }
        else if (c > 0x07f) {
            buffer[offset++] = (0x00C0 | ((c & 0x07C0) >> 6));
            buffer[offset++] = (0x0080 | (c & 0x003F));
            localBuffer[offset++] = (0x00C0 | ((c & 0x07C0) >> 6));
            localBuffer[offset++] = (0x0080 | (c & 0x003F));
        }
        else {
            buffer[offset++] = c;
            localBuffer[offset++] = c;
        }
        decodeResult.append(std::begin(localBuffer), std::next(std::begin(localBuffer), offset));
    }
    return (!s.empty()) ? UTF8(s + reinterpret_cast<const char *>(buffer.get()))
                        : UTF8(reinterpret_cast<const char *>(buffer.get()));
    return UTF8(decodeResult.c_str());
}

void UCS2::append(const uint32_t &ucs2char)