//===-- Unittests for StringConverter class -------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "hdr/errno_macros.h" #include "hdr/types/char32_t.h" #include "hdr/types/char8_t.h" #include "src/__support/error_or.h" #include "src/__support/macros/properties/os.h" #include "src/__support/wchar/mbstate.h" #include "src/__support/wchar/string_converter.h" #include "test/UnitTest/Test.h" // TODO: add support for 16-bit widechars to StringConverter to remove this // macro #ifdef LIBC_TARGET_OS_IS_WINDOWS TEST(LlvmLibcStringConverterTest, Windows) { // pass on windows for now } #else TEST(LlvmLibcStringConverterTest, UTF8To32) { // first 4 bytes are clown emoji (🤡) // next 3 bytes are sigma symbol (∑) // next 2 bytes are y with diaeresis (ÿ) // last byte is the letter A const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91\xC3\xBF\x41"; LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc( reinterpret_cast(src), &state, SIZE_MAX); auto res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x1f921); ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x2211); ASSERT_EQ(static_cast(sc.getSourceIndex()), 7); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xff); ASSERT_EQ(static_cast(sc.getSourceIndex()), 9); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x41); ASSERT_EQ(static_cast(sc.getSourceIndex()), 10); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0); ASSERT_EQ(static_cast(sc.getSourceIndex()), 11); res = sc.pop(); ASSERT_FALSE(res.has_value()); ASSERT_EQ(res.error(), -1); ASSERT_EQ(static_cast(sc.getSourceIndex()), 11); } TEST(LlvmLibcStringConverterTest, UTF32To8) { // clown emoji, sigma symbol, y with diaeresis, letter A const wchar_t src[] = {static_cast(0x1f921), static_cast(0x2211), static_cast(0xff), static_cast(0x41), static_cast(0x0)}; LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc( reinterpret_cast(src), &state, SIZE_MAX); auto res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xF0); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x9F); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xA4); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xA1); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); // end of clown emoji, sigma symbol begins res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xE2); ASSERT_EQ(static_cast(sc.getSourceIndex()), 2); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x88); ASSERT_EQ(static_cast(sc.getSourceIndex()), 2); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x91); ASSERT_EQ(static_cast(sc.getSourceIndex()), 2); // end of sigma symbol, y with diaeresis begins res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xC3); ASSERT_EQ(static_cast(sc.getSourceIndex()), 3); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xBF); ASSERT_EQ(static_cast(sc.getSourceIndex()), 3); // end of y with diaeresis, letter A begins res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x41); ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); // null byte res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0); ASSERT_EQ(static_cast(sc.getSourceIndex()), 5); res = sc.pop(); ASSERT_FALSE(res.has_value()); ASSERT_EQ(res.error(), -1); ASSERT_EQ(static_cast(sc.getSourceIndex()), 5); } TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) { const wchar_t src[] = { static_cast(0x1f921), static_cast(0x2211), static_cast(0x0)}; // clown emoji, sigma symbol LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc( reinterpret_cast(src), &state, SIZE_MAX, 1); auto res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xF0); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x9F); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xA4); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xA1); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); // can only read 1 character from source string, so error on next pop res = sc.pop(); ASSERT_FALSE(res.has_value()); ASSERT_EQ(res.error(), -1); } TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) { // first 4 bytes are clown emoji, then next 3 are sigma symbol const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91"; LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc( reinterpret_cast(src), &state, SIZE_MAX, 5); auto res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x1f921); ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); res = sc.pop(); ASSERT_FALSE(res.has_value()); ASSERT_EQ(static_cast(res.error()), -1); ASSERT_EQ(static_cast(sc.getSourceIndex()), 5); } TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) { const wchar_t src[] = { static_cast(0x1f921), static_cast(0xffffff), static_cast(0x0)}; // clown emoji, invalid utf32 LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc( reinterpret_cast(src), &state, SIZE_MAX); auto res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xF0); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x9F); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xA4); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xA1); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_FALSE(res.has_value()); ASSERT_EQ(static_cast(res.error()), EILSEQ); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); } TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) { // first 4 bytes are clown emoji (🤡) // next 3 form an invalid character const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30"; LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc( reinterpret_cast(src), &state, SIZE_MAX); auto res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x1f921); ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); res = sc.pop(); ASSERT_FALSE(res.has_value()); ASSERT_EQ(static_cast(res.error()), EILSEQ); ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); } TEST(LlvmLibcStringConverterTest, InvalidCharacterOutsideBounds) { // if an invalid character exists in the source string but we don't have space // to write it, we should return a "stop converting" error rather than an // invalid character error // first 4 bytes are clown emoji (🤡) // next 3 form an invalid character const char *src1 = "\xF0\x9F\xA4\xA1\x90\x88\x30"; LIBC_NAMESPACE::internal::mbstate ps1; LIBC_NAMESPACE::internal::StringConverter sc1( reinterpret_cast(src1), &ps1, 1); auto res1 = sc1.pop(); ASSERT_TRUE(res1.has_value()); ASSERT_EQ(static_cast(res1.value()), 0x1f921); ASSERT_EQ(static_cast(sc1.getSourceIndex()), 4); res1 = sc1.pop(); ASSERT_FALSE(res1.has_value()); // no space to write error NOT invalid character error (EILSEQ) ASSERT_EQ(static_cast(res1.error()), -1); ASSERT_EQ(static_cast(sc1.getSourceIndex()), 4); const wchar_t src2[] = { static_cast(0x1f921), static_cast(0xffffff), static_cast(0x0)}; // clown emoji, invalid utf32 LIBC_NAMESPACE::internal::mbstate ps2; LIBC_NAMESPACE::internal::StringConverter sc2( reinterpret_cast(src2), &ps2, 4); auto res2 = sc2.pop(); ASSERT_TRUE(res2.has_value()); ASSERT_EQ(static_cast(res2.value()), 0xF0); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); res2 = sc2.pop(); ASSERT_TRUE(res2.has_value()); ASSERT_EQ(static_cast(res2.value()), 0x9F); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); res2 = sc2.pop(); ASSERT_TRUE(res2.has_value()); ASSERT_EQ(static_cast(res2.value()), 0xA4); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); res2 = sc2.pop(); ASSERT_TRUE(res2.has_value()); ASSERT_EQ(static_cast(res2.value()), 0xA1); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); res2 = sc2.pop(); ASSERT_FALSE(res2.has_value()); // no space to write error NOT invalid character error (EILSEQ) ASSERT_EQ(static_cast(res2.error()), -1); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); } TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) { /* We do NOT test partially popping a character and expecting the next StringConverter to continue where we left off. This is not expected to work and considered invalid. */ const wchar_t src[] = { static_cast(0x1f921), static_cast(0xff), static_cast(0x0)}; // clown emoji, y with diaeresis (ÿ) LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc1( reinterpret_cast(src), &state, SIZE_MAX, 1); auto res = sc1.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xF0); ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); res = sc1.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x9F); ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); res = sc1.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xA4); ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); res = sc1.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xA1); ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); // sc2 should pick up where sc1 left off and continue the conversion LIBC_NAMESPACE::internal::StringConverter sc2( reinterpret_cast(src) + sc1.getSourceIndex(), &state, SIZE_MAX, 1); res = sc2.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xC3); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); res = sc2.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0xBF); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); } TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) { const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc1( reinterpret_cast(src), &state, SIZE_MAX, 2); auto res = sc1.pop(); ASSERT_FALSE(res.has_value()); ASSERT_EQ(static_cast(res.error()), -1); ASSERT_EQ(static_cast(sc1.getSourceIndex()), 2); // sc2 should pick up where sc1 left off and continue the conversion LIBC_NAMESPACE::internal::StringConverter sc2( reinterpret_cast(src) + sc1.getSourceIndex(), &state, SIZE_MAX, 3); res = sc2.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0x1f921); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 2); res = sc2.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(res.value()), 0); ASSERT_EQ(static_cast(sc2.getSourceIndex()), 3); } TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) { const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc( reinterpret_cast(src), &state, 1); auto res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); res = sc.pop(); // no space to pop this into ASSERT_FALSE(res.has_value()); } TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) { const wchar_t src[] = {static_cast(0x1f921), static_cast(0x1f921)}; // 2 clown emojis LIBC_NAMESPACE::internal::mbstate state; LIBC_NAMESPACE::internal::StringConverter sc( reinterpret_cast(src), &state, 5); auto res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_TRUE(res.has_value()); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); res = sc.pop(); ASSERT_FALSE(res.has_value()); ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); } #endif