@@ -199,11 +199,25 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in)
199199{
200200 std::wstring result;
201201 result.reserve (in.size ());
202+
203+ for (auto codepoint : utf8_to_utf32 (in))
204+ utf16_append_code (codepoint, result);
205+
206+ return result;
207+ }
208+
209+ /// Convert a UTF-8 encoded string to UTF-32 with architecture-native endianness.
210+ /// \param utf8_str String in UTF-8 format.
211+ /// \return String in UTF-32 format.
212+ std::u32string utf8_to_utf32 (const std::string &utf8_str)
213+ {
214+ std::u32string result;
215+ result.reserve (utf8_str.size ());
202216 std::string::size_type i = 0 ;
203- while (i < in .size ())
217+ while (i < utf8_str .size ())
204218 {
205- unsigned char c = in [i++];
206- unsigned int code = 0 ;
219+ unsigned char c = utf8_str [i++];
220+ char32_t code = 0 ;
207221 // the ifs that follow find out how many UTF-8 bytes (1-4) encode the
208222 // next Unicode code point. This is determined by the most significant
209223 // bits of the first byte.
@@ -212,31 +226,31 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in)
212226 // if it's a single byte, then the code is exactly that byte's value
213227 code = c;
214228 }
215- else if (c <= 0xDF && i < in .size ())
229+ else if (c <= 0xDF && i < utf8_str .size ())
216230 { // in other cases, we need to read the right number of chars and decode
217231 // note: if we wanted to reject malformed strings, we would also have to
218232 // check that every byte following the first one starts with the
219233 // bits 10.
220234 code = (c & 0x1Fu ) << 6 ;
221- c = in [i++];
235+ c = utf8_str [i++];
222236 code += c & 0x3Fu ;
223237 }
224- else if (c <= 0xEF && i + 1 < in .size ())
238+ else if (c <= 0xEF && i + 1 < utf8_str .size ())
225239 {
226240 code = (c & 0xFu ) << 12 ;
227- c = in [i++];
241+ c = utf8_str [i++];
228242 code += (c & 0x3Fu ) << 6 ;
229- c = in [i++];
243+ c = utf8_str [i++];
230244 code += c & 0x3Fu ;
231245 }
232- else if (c <= 0xF7 && i + 2 < in .size ())
246+ else if (c <= 0xF7 && i + 2 < utf8_str .size ())
233247 {
234248 code = (c & 0x7u ) << 18 ;
235- c = in [i++];
249+ c = utf8_str [i++];
236250 code += (c & 0x3Fu ) << 12 ;
237- c = in [i++];
251+ c = utf8_str [i++];
238252 code += (c & 0x3Fu ) << 6 ;
239- c = in [i++];
253+ c = utf8_str [i++];
240254 code += c & 0x3Fu ;
241255 }
242256 else
@@ -248,7 +262,7 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in)
248262 code = 32 ;
249263 }
250264
251- utf16_append_code (code, result );
265+ result. append ( 1 , code );
252266 }
253267
254268 return result;
0 commit comments