Skip to content

Commit 71d55b5

Browse files
authored
Merge pull request #5277 from diffblue/unicode
unicode.h: introduce utf8_to_utf32
2 parents e4c80a5 + 61aafe7 commit 71d55b5

File tree

2 files changed

+136
-129
lines changed

2 files changed

+136
-129
lines changed

src/util/unicode.cpp

Lines changed: 130 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -18,124 +18,117 @@ Author: Daniel Kroening, kroening@kroening.com
1818
#include "invariant.h"
1919

2020
#ifdef _WIN32
21-
#include <util/pragma_push.def>
22-
#ifdef _MSC_VER
23-
#pragma warning(disable:4668)
24-
// using #if/#elif on undefined macro
25-
#pragma warning(disable : 5039)
21+
# include <util/pragma_push.def>
22+
# ifdef _MSC_VER
23+
# pragma warning(disable : 4668)
24+
// using #if/#elif on undefined macro
25+
# pragma warning(disable : 5039)
2626
// pointer or reference to potentially throwing function passed to extern C
27-
#endif
28-
#include <windows.h>
29-
#include <util/pragma_pop.def>
27+
# endif
28+
# include <util/pragma_pop.def>
29+
# include <windows.h>
3030
#endif
3131

32+
static void utf8_append_code(unsigned int c, std::string &);
33+
3234
std::string narrow(const wchar_t *s)
3335
{
34-
#ifdef _WIN32
36+
#ifdef _WIN32
3537

36-
int slength=static_cast<int>(wcslen(s));
37-
int rlength=
38+
int slength = static_cast<int>(wcslen(s));
39+
int rlength =
3840
WideCharToMultiByte(CP_UTF8, 0, s, slength, NULL, 0, NULL, NULL);
3941
std::string r(rlength, 0);
4042
WideCharToMultiByte(CP_UTF8, 0, s, slength, &r[0], rlength, NULL, NULL);
4143
return r;
4244

43-
#else
44-
// dummy conversion
45-
std::string r;
46-
r.reserve(wcslen(s));
47-
while(*s!=0)
48-
{
49-
r+=static_cast<char>(*s);
50-
s++;
51-
}
52-
53-
return r;
54-
#endif
45+
#else
46+
return narrow(std::wstring(s));
47+
#endif
5548
}
5649

5750
std::wstring widen(const char *s)
5851
{
59-
#ifdef _WIN32
52+
#ifdef _WIN32
6053

61-
int slength=static_cast<int>(strlen(s));
62-
int rlength=
63-
MultiByteToWideChar(CP_UTF8, 0, s, slength, NULL, 0);
54+
int slength = static_cast<int>(strlen(s));
55+
int rlength = MultiByteToWideChar(CP_UTF8, 0, s, slength, NULL, 0);
6456
std::wstring r(rlength, 0);
6557
MultiByteToWideChar(CP_UTF8, 0, s, slength, &r[0], rlength);
6658
return r;
6759

68-
#else
69-
// dummy conversion
70-
std::wstring r;
71-
r.reserve(strlen(s));
72-
while(*s!=0)
73-
{
74-
r+=wchar_t(*s);
75-
s++;
76-
}
77-
78-
return r;
79-
#endif
60+
#else
61+
return widen(std::string(s));
62+
#endif
8063
}
8164

8265
std::string narrow(const std::wstring &s)
8366
{
84-
#ifdef _WIN32
67+
#ifdef _WIN32
8568

86-
int slength=static_cast<int>(s.size());
87-
int rlength=
69+
int slength = static_cast<int>(s.size());
70+
int rlength =
8871
WideCharToMultiByte(CP_UTF8, 0, &s[0], slength, NULL, 0, NULL, NULL);
8972
std::string r(rlength, 0);
9073
WideCharToMultiByte(CP_UTF8, 0, &s[0], slength, &r[0], rlength, NULL, NULL);
9174
return r;
9275

93-
#else
94-
// dummy conversion
95-
return std::string(s.begin(), s.end());
96-
#endif
76+
#else
77+
std::string result;
78+
79+
result.reserve(s.size()); // at least that long
80+
81+
for(const auto codepoint : s)
82+
utf8_append_code(codepoint, result);
83+
84+
return result;
85+
#endif
9786
}
9887

9988
std::wstring widen(const std::string &s)
10089
{
101-
#ifdef _WIN32
90+
#ifdef _WIN32
10291

103-
int slength=static_cast<int>(s.size());
104-
int rlength=
105-
MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, NULL, 0);
92+
int slength = static_cast<int>(s.size());
93+
int rlength = MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, NULL, 0);
10694
std::wstring r(rlength, 0);
10795
MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, &r[0], rlength);
10896
return r;
10997

110-
#else
111-
// dummy conversion
112-
return std::wstring(s.begin(), s.end());
113-
#endif
98+
#else
99+
auto utf32 = utf8_to_utf32(std::string(s));
100+
101+
std::wstring r;
102+
r.reserve(utf32.size());
103+
for(auto codepoint : utf32)
104+
r += codepoint;
105+
return r;
106+
#endif
114107
}
115108

116109
/// Appends a unicode character to a utf8-encoded string
117110
/// \par parameters: character to append, string to append to
118111
static void utf8_append_code(unsigned int c, std::string &result)
119112
{
120-
if(c<=0x7f)
121-
result+=static_cast<char>(c);
122-
else if(c<=0x7ff)
113+
if(c <= 0x7f)
114+
result += static_cast<char>(c);
115+
else if(c <= 0x7ff)
123116
{
124-
result+=static_cast<char>((c >> 6) | 0xc0);
125-
result+=static_cast<char>((c &0x3f) | 0x80);
117+
result += static_cast<char>((c >> 6) | 0xc0);
118+
result += static_cast<char>((c & 0x3f) | 0x80);
126119
}
127-
else if(c<=0xffff)
120+
else if(c <= 0xffff)
128121
{
129-
result+=static_cast<char>((c >> 12) | 0xe0);
130-
result+=static_cast<char>(((c >> 6) &0x3f) | 0x80);
131-
result+=static_cast<char>((c &0x3f) | 0x80);
122+
result += static_cast<char>((c >> 12) | 0xe0);
123+
result += static_cast<char>(((c >> 6) & 0x3f) | 0x80);
124+
result += static_cast<char>((c & 0x3f) | 0x80);
132125
}
133126
else
134127
{
135-
result+=static_cast<char>((c >> 18) | 0xf0);
136-
result+=static_cast<char>(((c >> 12) &0x3f)| 0x80);
137-
result+=static_cast<char>(((c >> 6) &0x3f) | 0x80);
138-
result+=static_cast<char>((c &0x3f) | 0x80);
128+
result += static_cast<char>((c >> 18) | 0xf0);
129+
result += static_cast<char>(((c >> 12) & 0x3f) | 0x80);
130+
result += static_cast<char>(((c >> 6) & 0x3f) | 0x80);
131+
result += static_cast<char>((c & 0x3f) | 0x80);
139132
}
140133
}
141134

@@ -156,13 +149,13 @@ utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
156149

157150
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
158151
{
159-
if(argv_wide==nullptr)
152+
if(argv_wide == nullptr)
160153
return std::vector<std::string>();
161154

162155
std::vector<std::string> argv_narrow;
163156
argv_narrow.reserve(argc);
164157

165-
for(int i=0; i!=argc; ++i)
158+
for(int i = 0; i != argc; ++i)
166159
argv_narrow.push_back(narrow(argv_wide[i]));
167160

168161
return argv_narrow;
@@ -173,7 +166,7 @@ static void utf16_append_code(unsigned int code, std::wstring &result)
173166
// we do not treat 0xD800 to 0xDFFF, although
174167
// they are not valid unicode symbols
175168

176-
if(code<0xFFFF)
169+
if(code < 0xFFFF)
177170
{
178171
// code is encoded as one UTF16 character
179172
result += static_cast<wchar_t>(code);
@@ -185,76 +178,89 @@ static void utf16_append_code(unsigned int code, std::wstring &result)
185178
// but let's not check it programmatically
186179

187180
// encode the code in UTF16
188-
code=code-0x10000;
181+
code = code - 0x10000;
189182
const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
190183
result += static_cast<wchar_t>(i1);
191184
const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
192185
result += static_cast<wchar_t>(i2);
193186
}
194187
}
195188

196-
197189
/// Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
198190
/// \par parameters: String in UTF-8 format
199191
/// \return String in UTF-16 format. The encoding follows the endianness of the
200192
/// architecture.
201193
std::wstring utf8_to_utf16_native_endian(const std::string &in)
202194
{
203-
std::wstring result;
204-
result.reserve(in.size());
205-
std::string::size_type i=0;
206-
while(i<in.size())
195+
std::wstring result;
196+
result.reserve(in.size());
197+
198+
for(auto codepoint : utf8_to_utf32(in))
199+
utf16_append_code(codepoint, result);
200+
201+
return result;
202+
}
203+
204+
/// Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
205+
/// \par parameters: String in UTF-8 format
206+
/// \return String in UTF-32 format.
207+
std::u32string utf8_to_utf32(const std::string &utf8_str)
208+
{
209+
std::u32string result;
210+
result.reserve(utf8_str.size());
211+
std::string::size_type i = 0;
212+
while(i < utf8_str.size())
213+
{
214+
unsigned char c = utf8_str[i++];
215+
char32_t code = 0;
216+
// the ifs that follow find out how many UTF8 characters (1-4) store the
217+
// next unicode character. This is determined by the few most
218+
// significant bits.
219+
if(c <= 0x7F)
207220
{
208-
unsigned char c=in[i++];
209-
unsigned int code=0;
210-
// the ifs that follow find out how many UTF8 characters (1-4) store the
211-
// next unicode character. This is determined by the few most
212-
// significant bits.
213-
if(c<=0x7F)
214-
{
215-
// if it's one character, then code is exactly the value
216-
code=c;
217-
}
218-
else if(c<=0xDF && i<in.size())
219-
{ // in other cases, we need to read the right number of chars and decode
220-
// note: if we wanted to make sure that we capture incorrect strings,
221-
// we should check that whatever follows first character starts with
222-
// bits 10.
223-
code = (c & 0x1Fu) << 6;
224-
c=in[i++];
225-
code += c & 0x3Fu;
226-
}
227-
else if(c<=0xEF && i+1<in.size())
228-
{
229-
code = (c & 0xFu) << 12;
230-
c=in[i++];
231-
code += (c & 0x3Fu) << 6;
232-
c=in[i++];
233-
code += c & 0x3Fu;
234-
}
235-
else if(c<=0xF7 && i+2<in.size())
236-
{
237-
code = (c & 0x7u) << 18;
238-
c=in[i++];
239-
code += (c & 0x3Fu) << 12;
240-
c=in[i++];
241-
code += (c & 0x3Fu) << 6;
242-
c=in[i++];
243-
code += c & 0x3Fu;
244-
}
245-
else
246-
{
247-
// The string is not a valid UTF8 string! Either it has some characters
248-
// missing from a multi-character unicode symbol, or it has a char with
249-
// too high value.
250-
// For now, let's replace the character with a space
251-
code=32;
252-
}
253-
254-
utf16_append_code(code, result);
221+
// if it's one character, then code is exactly the value
222+
code = c;
223+
}
224+
else if(c <= 0xDF && i < utf8_str.size())
225+
{ // in other cases, we need to read the right number of chars and decode
226+
// note: if we wanted to make sure that we capture incorrect strings,
227+
// we should check that whatever follows first character starts with
228+
// bits 10.
229+
code = (c & 0x1Fu) << 6;
230+
c = utf8_str[i++];
231+
code += c & 0x3Fu;
232+
}
233+
else if(c <= 0xEF && i + 1 < utf8_str.size())
234+
{
235+
code = (c & 0xFu) << 12;
236+
c = utf8_str[i++];
237+
code += (c & 0x3Fu) << 6;
238+
c = utf8_str[i++];
239+
code += c & 0x3Fu;
240+
}
241+
else if(c <= 0xF7 && i + 2 < utf8_str.size())
242+
{
243+
code = (c & 0x7u) << 18;
244+
c = utf8_str[i++];
245+
code += (c & 0x3Fu) << 12;
246+
c = utf8_str[i++];
247+
code += (c & 0x3Fu) << 6;
248+
c = utf8_str[i++];
249+
code += c & 0x3Fu;
250+
}
251+
else
252+
{
253+
// The string is not a valid UTF8 string! Either it has some characters
254+
// missing from a multi-character unicode symbol, or it has a char with
255+
// too high value.
256+
// For now, let's replace the character with a space
257+
code = 32;
255258
}
256259

257-
return result;
260+
result.append(1, code);
261+
}
262+
263+
return result;
258264
}
259265

260266
/// Escapes non-printable characters, whitespace except for spaces, double

src/util/unicode.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ Author: Daniel Kroening, kroening@kroening.com
66
77
\*******************************************************************/
88

9-
109
#ifndef CPROVER_UTIL_UNICODE_H
1110
#define CPROVER_UTIL_UNICODE_H
1211

@@ -25,6 +24,10 @@ std::wstring widen(const std::string &s);
2524
std::string
2625
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s);
2726

27+
/// \param utf8_str: UTF-8 string
28+
/// \return UTF-32 encoding of the string
29+
std::u32string utf8_to_utf32(const std::string &utf8_str);
30+
2831
std::wstring utf8_to_utf16_native_endian(const std::string &in);
2932
std::string utf16_native_endian_to_java(const char16_t ch);
3033
std::string utf16_native_endian_to_java(const std::wstring &in);
@@ -57,10 +60,8 @@ std::vector<const char *> to_c_str_array(It b, It e)
5760
{
5861
// Assumes that walking the range will be faster than repeated allocation
5962
std::vector<const char *> ret(std::distance(b, e) + 1, nullptr);
60-
std::transform(b, e, std::begin(ret), [] (const std::string & s)
61-
{
62-
return s.c_str();
63-
});
63+
std::transform(
64+
b, e, std::begin(ret), [](const std::string &s) { return s.c_str(); });
6465
return ret;
6566
}
6667

0 commit comments

Comments
 (0)