Skip to content

Commit e3189d0

Browse files
committed
introduce utf8_to_utf32
This is the counterpart to utf32_native_endian_to_utf8. All functions use native endianness, thus there is no need to make this part of the function name.
1 parent f1f853c commit e3189d0

File tree

2 files changed

+31
-13
lines changed

2 files changed

+31
-13
lines changed

src/util/unicode.cpp

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -199,11 +199,25 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in)
199199
{
200200
std::wstring result;
201201
result.reserve(in.size());
202+
203+
for(auto codepoint : utf8_to_utf32(in))
204+
utf16_append_code(codepoint, result);
205+
206+
return result;
207+
}
208+
209+
/// Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
210+
/// \param utf8_str: String in UTF-8 format
211+
/// \return String in UTF-32 format.
212+
std::u32string utf8_to_utf32(const std::string &utf8_str)
213+
{
214+
std::u32string result;
215+
result.reserve(utf8_str.size());
202216
std::string::size_type i = 0;
203-
while(i < in.size())
217+
while(i < utf8_str.size())
204218
{
205-
unsigned char c = in[i++];
206-
unsigned int code = 0;
219+
unsigned char c = utf8_str[i++];
220+
char32_t code = 0;
207221
// the ifs that follow find out how many UTF-8 bytes (1-4) store the
208222
// next unicode character. This is determined by the few most
209223
// significant bits.
@@ -212,31 +226,31 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in)
212226
// if it's one character, then code is exactly the value
213227
code = c;
214228
}
215-
else if(c <= 0xDF && i < in.size())
229+
else if(c <= 0xDF && i < utf8_str.size())
216230
{ // in other cases, we need to read the right number of chars and decode
217231
// note: if we wanted to make sure that we capture incorrect strings,
218232
// we should check that whatever follows first character starts with
219233
// bits 10.
220234
code = (c & 0x1Fu) << 6;
221-
c = in[i++];
235+
c = utf8_str[i++];
222236
code += c & 0x3Fu;
223237
}
224-
else if(c <= 0xEF && i + 1 < in.size())
238+
else if(c <= 0xEF && i + 1 < utf8_str.size())
225239
{
226240
code = (c & 0xFu) << 12;
227-
c = in[i++];
241+
c = utf8_str[i++];
228242
code += (c & 0x3Fu) << 6;
229-
c = in[i++];
243+
c = utf8_str[i++];
230244
code += c & 0x3Fu;
231245
}
232-
else if(c <= 0xF7 && i + 2 < in.size())
246+
else if(c <= 0xF7 && i + 2 < utf8_str.size())
233247
{
234248
code = (c & 0x7u) << 18;
235-
c = in[i++];
249+
c = utf8_str[i++];
236250
code += (c & 0x3Fu) << 12;
237-
c = in[i++];
251+
c = utf8_str[i++];
238252
code += (c & 0x3Fu) << 6;
239-
c = in[i++];
253+
c = utf8_str[i++];
240254
code += c & 0x3Fu;
241255
}
242256
else
@@ -248,7 +262,7 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in)
248262
code = 32;
249263
}
250264

251-
utf16_append_code(code, result);
265+
result.append(1, code);
252266
}
253267

254268
return result;

src/util/unicode.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ std::wstring widen(const std::string &s);
2424
std::string
2525
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s);
2626

27+
/// \param utf8_str: UTF-8 string
28+
/// \return UTF-32 encoding of the string
29+
std::u32string utf8_to_utf32(const std::string &utf8_str);
30+
2731
std::wstring utf8_to_utf16_native_endian(const std::string &in);
2832
std::string utf16_native_endian_to_java(const char16_t ch);
2933
std::string utf16_native_endian_to_java(const std::wstring &in);

0 commit comments

Comments
 (0)