TGUI 1.11
Loading...
Searching...
No Matches
Utf.hpp
1
2//
3// TGUI - Texus' Graphical User Interface
4// Copyright (C) 2012-2025 Bruno Van de Velde (vdv_b@tgui.eu)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14// you must not claim that you wrote the original software.
15// If you use this software in a product, an acknowledgment
16// in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19// and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
24
25#ifndef TGUI_UTF_HPP
26#define TGUI_UTF_HPP
27
28#include <TGUI/Config.hpp>
29
30#include <cstdint>
31#include <string>
32#include <array>
33
35
36namespace tgui
37{
38 namespace utf
39 {
/// @brief Encodes a single UTF-32 codepoint as UTF-8 and appends the bytes to a string
///
/// @param input       Codepoint to encode
/// @param outStrUtf8  String that receives the encoded bytes (1 to 4 elements are appended)
///
/// Nothing is appended when the codepoint lies above U+10FFFF or is a surrogate (U+D800..U+DFFF),
/// because such values have no well-formed UTF-8 representation.
template <typename CharT> // CharT is either char or char8_t
void encodeCharUtf8(char32_t input, std::basic_string<CharT>& outStrUtf8)
{
    // ASCII maps to exactly one byte
    if (input < 0x80)
    {
        outStrUtf8.push_back(static_cast<CharT>(input));
        return;
    }

    // Skip values that cannot be encoded: anything beyond the Unicode range and every
    // surrogate codepoint (both the high half D800..DBFF and the low half DC00..DFFF)
    if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDFFF)))
        return;

    // Determine the sequence length and the marker bits of the leading byte
    std::size_t bytesToWrite;
    std::uint8_t firstByteMask;
    if (input < 0x800)
    {
        bytesToWrite = 2;
        firstByteMask = 0xC0;
    }
    else if (input < 0x10000)
    {
        bytesToWrite = 3;
        firstByteMask = 0xE0;
    }
    else
    {
        bytesToWrite = 4;
        firstByteMask = 0xF0;
    }

    // Fill the continuation bytes from last to first, each carrying the lowest 6 remaining bits
    std::array<CharT, 4> bytes{};
    for (std::size_t i = bytesToWrite - 1; i > 0; --i)
    {
        bytes[i] = static_cast<CharT>((input | 0x80) & 0xBF);
        input >>= 6;
    }

    // Whatever bits are left belong in the leading byte
    bytes[0] = static_cast<CharT>(input | firstByteMask);

    // Add them to the output
    outStrUtf8.append(bytes.begin(), bytes.begin() + static_cast<std::ptrdiff_t>(bytesToWrite));
}
87
/// @brief Decodes one UTF-8 encoded character and appends the resulting codepoint to a UTF-32 string
///
/// @param inputCharIt  Iterator to the first byte of the character to decode
/// @param inputEndIt   Iterator to the end of the input
/// @param outStrUtf32  String to which the decoded codepoint is appended
///
/// @return Iterator to the byte that follows the decoded character. When the range is empty or the
///         sequence is cut off before its last byte, nothing is appended and inputEndIt is returned.
///
/// The sequence length is derived from the leading byte only; the following bytes are not validated.
template <typename CharIt> // CharIt is an iterator for a string containing either char or char8_t
CharIt decodeCharUtf8(CharIt inputCharIt, CharIt inputEndIt, std::u32string& outStrUtf32)
{
    // An empty range holds nothing to decode and the end iterator must never be dereferenced
    if (inputCharIt == inputEndIt)
        return inputEndIt;

    const auto leadByte = static_cast<std::uint8_t>(*inputCharIt);
    if (leadByte < 128)
    {
        // ASCII is stored as a single byte
        outStrUtf32.push_back(static_cast<char32_t>(leadByte));
        return ++inputCharIt;
    }

    // Value to subtract after summing the raw bytes: it cancels the marker bits of every byte in the sequence
    static const std::uint32_t offsetsMap[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

    // Number of bytes that follow a leading byte, indexed by (leading byte - 128)
    static const std::uint8_t trailingMap[128] =
    {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
    };

    const std::uint8_t trailingBytes = trailingMap[leadByte - 128];
    const auto remainingBytes = std::distance(inputCharIt, inputEndIt) - 1;

    // Incomplete character: drop it and consume the rest of the input
    if (remainingBytes < static_cast<decltype(remainingBytes)>(trailingBytes))
        return inputEndIt;

    // Accumulate all bytes but the last, making room for 6 more bits after each one
    char32_t outputChar = 0;
    for (std::uint8_t i = 0; i < trailingBytes; ++i)
    {
        outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt));
        ++inputCharIt;
        outputChar <<= 6;
    }

    // Add the final byte and strip the accumulated marker bits
    outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt));
    ++inputCharIt;
    outputChar -= offsetsMap[trailingBytes];
    outStrUtf32.push_back(outputChar);

    return inputCharIt;
}
136
#if defined(__cpp_lib_char8_t) && (__cpp_lib_char8_t >= 201811L)
/// @brief Converts a UTF-32 string to a UTF-8 encoded std::u8string
///
/// @param strUtf32  String to convert
///
/// @return The codepoints of strUtf32 encoded one after another with encodeCharUtf8
///
/// Only available when the standard library reports char8_t support.
TGUI_NODISCARD inline std::u8string convertUtf32toUtf8(const std::u32string& strUtf32)
{
    std::u8string encoded;

    // Reserve a lower bound up front: every encoded codepoint takes at least one element
    encoded.reserve(strUtf32.length() + 1);

    for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
        encodeCharUtf8(*it, encoded);

    return encoded;
}
#endif
153
/// @brief Converts a range of UTF-8 encoded bytes to a UTF-32 string
///
/// @param inputBegin  Iterator to the first byte of the input
/// @param inputEnd    Iterator to the end of the input
///
/// @return The codepoints produced by calling decodeCharUtf8 repeatedly until the input is consumed
///
/// The iterators must support subtraction and operator<.
template <typename CharIt>
TGUI_NODISCARD std::u32string convertUtf8toUtf32(CharIt inputBegin, CharIt inputEnd)
{
    std::u32string decoded;

    // The output never holds more codepoints than the input holds bytes
    const auto byteCount = inputEnd - inputBegin;
    decoded.reserve(static_cast<std::size_t>(byteCount + 1));

    // The decoder reports where the next character starts
    for (CharIt cursor = inputBegin; cursor < inputEnd; )
        cursor = decodeCharUtf8(cursor, inputEnd, decoded);

    return decoded;
}
172
/// @brief Converts a range of UTF-16 code units to a UTF-32 string
///
/// @param inputBegin  Iterator to the first code unit of the input
/// @param inputEnd    Iterator to the end of the input
///
/// @return The decoded codepoints
///
/// Ill-formed input is skipped one code unit at a time: a high surrogate that is not followed by a
/// low surrogate and a low surrogate that is not preceded by a high surrogate are both dropped,
/// while the code units around them are still decoded.
/// The iterators must support subtraction and operator<.
template <typename U16CharIt>
TGUI_NODISCARD std::u32string convertUtf16toUtf32(U16CharIt inputBegin, U16CharIt inputEnd)
{
    std::u32string outStrUtf32;
    outStrUtf32.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));

    auto it = inputBegin;
    while (it < inputEnd)
    {
        const char16_t first = *it++;

        // Anything outside the surrogate range is a complete codepoint on its own
        if ((first < 0xD800) || (first > 0xDFFF))
        {
            outStrUtf32.push_back(static_cast<char32_t>(first));
            continue;
        }

        // A low surrogate that doesn't follow a high surrogate is ill-formed
        if (first >= 0xDC00)
            continue;

        // A high surrogate needs a second code unit
        if (it == inputEnd)
            break;

        // Only consume the next unit when it completes the pair. Otherwise the high surrogate
        // is dropped and the next unit is decoded normally in the following iteration.
        const char16_t second = *it;
        if ((second < 0xDC00) || (second > 0xDFFF))
            continue;

        ++it;
        outStrUtf32.push_back(((static_cast<char32_t>(first) - 0xD800) << 10) + (static_cast<char32_t>(second) - 0xDC00) + 0x0010000);
    }

    return outStrUtf32;
}
208
/// @brief Converts a range of wide characters to a UTF-32 string
///
/// @param inputBegin  Iterator to the first wide character of the input
/// @param inputEnd    Iterator to the end of the input
///
/// @return String in which every input character was cast to char32_t
///
/// The iterators must support subtraction.
template <typename WCharIt>
TGUI_NODISCARD std::u32string convertWidetoUtf32(WCharIt inputBegin, WCharIt inputEnd)
{
    std::u32string widened;
    widened.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));

    // Wide strings are treated as UCS-2 on Windows and UCS-4 on Unix, which means that each
    // element is a complete codepoint and can be cast directly
    WCharIt cursor = inputBegin;
    while (cursor != inputEnd)
    {
        widened.push_back(static_cast<char32_t>(*cursor));
        ++cursor;
    }

    return widened;
}
226
232 TGUI_NODISCARD inline std::string convertUtf32toStdStringUtf8(const std::u32string& strUtf32)
233 {
234 std::string outStrUtf8;
235 outStrUtf8.reserve(strUtf32.length() + 1);
236 for (const char32_t codepoint : strUtf32)
237 encodeCharUtf8(codepoint, outStrUtf8);
238
239 return outStrUtf8;
240 }
241
/// @brief Converts a UTF-32 string to a wide string
///
/// @param strUtf32  String to convert
///
/// @return Wide string holding one element per codepoint that was kept
///
/// When wchar_t is 4 bytes wide every codepoint is copied unchanged. Otherwise only codepoints up to
/// U+FFFF that are not surrogates are kept and all others are dropped.
TGUI_NODISCARD inline std::wstring convertUtf32toWide(const std::u32string& strUtf32)
{
    std::wstring wide;
    wide.reserve(strUtf32.length() + 1);

    TGUI_IF_CONSTEXPR (sizeof(wchar_t) == 4)
    {
        // On Unix, wide characters are UCS-4, so each codepoint fits in one element
        for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
            wide.push_back(static_cast<wchar_t>(*it));
    }
    else
    {
        // On Windows, wide characters are UCS-2. Codepoints that can't be stored in
        // a single wide character are simply left out.
        for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
        {
            const char32_t cp = *it;
            const bool isSurrogate = (cp >= 0xD800) && (cp <= 0xDFFF);
            if (isSurrogate || (cp > 0xFFFF))
                continue;

            wide.push_back(static_cast<wchar_t>(cp));
        }
    }

    return wide;
}
270
/// @brief Converts a UTF-32 string to a UTF-16 string
///
/// @param strUtf32  String to convert
///
/// @return UTF-16 string in which codepoints above U+FFFF are stored as surrogate pairs
///
/// Surrogate codepoints (U+D800..U+DFFF) and values above U+10FFFF are dropped.
TGUI_NODISCARD inline std::u16string convertUtf32toUtf16(const std::u32string& strUtf32)
{
    std::u16string encoded;
    encoded.reserve(strUtf32.length() + 1);

    for (const char32_t cp : strUtf32)
    {
        // Skip everything that isn't a valid character: surrogate values and
        // anything greater than the maximum Unicode value
        const bool isSurrogate = (cp >= 0xD800) && (cp <= 0xDFFF);
        if (isSurrogate || (cp > 0x0010FFFF))
            continue;

        // A codepoint that fits inside 2 bytes is copied as a single element
        if (cp <= 0xFFFF)
        {
            encoded.push_back(static_cast<char16_t>(cp));
            continue;
        }

        // All other codepoints need to be converted to two UTF-16 elements
        const char32_t shifted = cp - 0x0010000;
        encoded.push_back(static_cast<char16_t>((shifted >> 10) + 0xD800));
        encoded.push_back(static_cast<char16_t>((shifted & 0x3FFUL) + 0xDC00));
    }

    return encoded;
}
301
303 }
304}
305
307
308#endif // TGUI_UTF_HPP
Namespace that contains all TGUI functions and classes.
Definition AbsoluteOrRelativeValue.hpp:36