@@ -18,124 +18,117 @@ Author: Daniel Kroening, kroening@kroening.com
1818#include " invariant.h"
1919
#ifdef _WIN32
// clang-format off
// The order of these includes is significant and must not be sorted:
// windows.h has to come after pragma_push.def and the warning(disable)
// pragmas, and before pragma_pop.def, so that the suppressions below are
// still in effect while windows.h is parsed (presumably pragma_pop.def
// restores the warning state saved by pragma_push.def -- confirm there).
#  include <util/pragma_push.def>
#  ifdef _MSC_VER
#    pragma warning(disable : 4668)
// using #if/#elif on undefined macro
#    pragma warning(disable : 5039)
// pointer or reference to potentially throwing function passed to extern C
#  endif
#  include <windows.h>
#  include <util/pragma_pop.def>
// clang-format on
#endif
3131
32+ static void utf8_append_code (unsigned int c, std::string &);
33+
3234std::string narrow (const wchar_t *s)
3335{
34- #ifdef _WIN32
36+ #ifdef _WIN32
3537
36- int slength= static_cast <int >(wcslen (s));
37- int rlength=
38+ int slength = static_cast <int >(wcslen (s));
39+ int rlength =
3840 WideCharToMultiByte (CP_UTF8, 0 , s, slength, NULL , 0 , NULL , NULL );
3941 std::string r (rlength, 0 );
4042 WideCharToMultiByte (CP_UTF8, 0 , s, slength, &r[0 ], rlength, NULL , NULL );
4143 return r;
4244
43- #else
44- // dummy conversion
45- std::string r;
46- r.reserve (wcslen (s));
47- while (*s!=0 )
48- {
49- r+=static_cast <char >(*s);
50- s++;
51- }
52-
53- return r;
54- #endif
45+ #else
46+ return narrow (std::wstring (s));
47+ #endif
5548}
5649
5750std::wstring widen (const char *s)
5851{
59- #ifdef _WIN32
52+ #ifdef _WIN32
6053
61- int slength=static_cast <int >(strlen (s));
62- int rlength=
63- MultiByteToWideChar (CP_UTF8, 0 , s, slength, NULL , 0 );
54+ int slength = static_cast <int >(strlen (s));
55+ int rlength = MultiByteToWideChar (CP_UTF8, 0 , s, slength, NULL , 0 );
6456 std::wstring r (rlength, 0 );
6557 MultiByteToWideChar (CP_UTF8, 0 , s, slength, &r[0 ], rlength);
6658 return r;
6759
68- #else
69- // dummy conversion
70- std::wstring r;
71- r.reserve (strlen (s));
72- while (*s!=0 )
73- {
74- r+=wchar_t (*s);
75- s++;
76- }
77-
78- return r;
79- #endif
60+ #else
61+ return widen (std::string (s));
62+ #endif
8063}
8164
8265std::string narrow (const std::wstring &s)
8366{
84- #ifdef _WIN32
67+ #ifdef _WIN32
8568
86- int slength= static_cast <int >(s.size ());
87- int rlength=
69+ int slength = static_cast <int >(s.size ());
70+ int rlength =
8871 WideCharToMultiByte (CP_UTF8, 0 , &s[0 ], slength, NULL , 0 , NULL , NULL );
8972 std::string r (rlength, 0 );
9073 WideCharToMultiByte (CP_UTF8, 0 , &s[0 ], slength, &r[0 ], rlength, NULL , NULL );
9174 return r;
9275
93- #else
94- // dummy conversion
95- return std::string (s.begin (), s.end ());
96- #endif
76+ #else
77+ std::string result;
78+
79+ result.reserve (s.size ()); // at least that long
80+
81+ for (const auto codepoint : s)
82+ utf8_append_code (codepoint, result);
83+
84+ return result;
85+ #endif
9786}
9887
9988std::wstring widen (const std::string &s)
10089{
101- #ifdef _WIN32
90+ #ifdef _WIN32
10291
103- int slength=static_cast <int >(s.size ());
104- int rlength=
105- MultiByteToWideChar (CP_UTF8, 0 , &s[0 ], slength, NULL , 0 );
92+ int slength = static_cast <int >(s.size ());
93+ int rlength = MultiByteToWideChar (CP_UTF8, 0 , &s[0 ], slength, NULL , 0 );
10694 std::wstring r (rlength, 0 );
10795 MultiByteToWideChar (CP_UTF8, 0 , &s[0 ], slength, &r[0 ], rlength);
10896 return r;
10997
110- #else
111- // dummy conversion
112- return std::wstring (s.begin (), s.end ());
113- #endif
98+ #else
99+ auto utf32 = utf8_to_utf32 (std::string (s));
100+
101+ std::wstring r;
102+ r.reserve (utf32.size ());
103+ for (auto codepoint : utf32)
104+ r += codepoint;
105+ return r;
106+ #endif
114107}
115108
/// Appends a unicode character to a utf8-encoded string
/// \par parameters: character to append, string to append to
///
/// One to four bytes are appended depending on the magnitude of the value.
/// Values above 0x10FFFF have no UTF-8 encoding; U+FFFD (replacement
/// character) is appended in their place. Values in 0xD800-0xDFFF are not
/// rejected; they are encoded as three bytes like their neighbours.
static void utf8_append_code(unsigned int c, std::string &result)
{
  // without this, the four-byte branch would emit a malformed lead byte
  if(c > 0x10ffff)
    c = 0xfffd;

  if(c <= 0x7f)
    result += static_cast<char>(c);
  else if(c <= 0x7ff)
  {
    result += static_cast<char>((c >> 6) | 0xc0);
    result += static_cast<char>((c & 0x3f) | 0x80);
  }
  else if(c <= 0xffff)
  {
    result += static_cast<char>((c >> 12) | 0xe0);
    result += static_cast<char>(((c >> 6) & 0x3f) | 0x80);
    result += static_cast<char>((c & 0x3f) | 0x80);
  }
  else
  {
    result += static_cast<char>((c >> 18) | 0xf0);
    result += static_cast<char>(((c >> 12) & 0x3f) | 0x80);
    result += static_cast<char>(((c >> 6) & 0x3f) | 0x80);
    result += static_cast<char>((c & 0x3f) | 0x80);
  }
}
141134
@@ -156,13 +149,13 @@ utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
156149
157150std::vector<std::string> narrow_argv (int argc, const wchar_t **argv_wide)
158151{
159- if (argv_wide== nullptr )
152+ if (argv_wide == nullptr )
160153 return std::vector<std::string>();
161154
162155 std::vector<std::string> argv_narrow;
163156 argv_narrow.reserve (argc);
164157
165- for (int i= 0 ; i!= argc; ++i)
158+ for (int i = 0 ; i != argc; ++i)
166159 argv_narrow.push_back (narrow (argv_wide[i]));
167160
168161 return argv_narrow;
@@ -173,7 +166,7 @@ static void utf16_append_code(unsigned int code, std::wstring &result)
173166 // we do not treat 0xD800 to 0xDFFF, although
174167 // they are not valid unicode symbols
175168
176- if (code< 0xFFFF )
169+ if (code < 0xFFFF )
177170 {
178171 // code is encoded as one UTF16 character
179172 result += static_cast <wchar_t >(code);
@@ -185,76 +178,89 @@ static void utf16_append_code(unsigned int code, std::wstring &result)
185178 // but let's not check it programmatically
186179
187180 // encode the code in UTF16
188- code= code- 0x10000 ;
181+ code = code - 0x10000 ;
189182 const uint16_t i1 = static_cast <uint16_t >(((code >> 10 ) & 0x3ff ) | 0xD800 );
190183 result += static_cast <wchar_t >(i1);
191184 const uint16_t i2 = static_cast <uint16_t >((code & 0x3ff ) | 0xDC00 );
192185 result += static_cast <wchar_t >(i2);
193186 }
194187}
195188
196-
197189// / Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
198190// / \par parameters: String in UTF-8 format
199191// / \return String in UTF-16 format. The encoding follows the endianness of the
200192// / architecture iff swap_bytes is true.
201193std::wstring utf8_to_utf16_native_endian (const std::string &in)
202194{
203- std::wstring result;
204- result.reserve (in.size ());
205- std::string::size_type i=0 ;
206- while (i<in.size ())
195+ std::wstring result;
196+ result.reserve (in.size ());
197+
198+ for (auto codepoint : utf8_to_utf32 (in))
199+ utf16_append_code (codepoint, result);
200+
201+ return result;
202+ }
203+
/// Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
/// \par parameters: String in UTF-8 format
/// \return String in UTF-32 format.
///
/// Malformed input never aborts the conversion. A byte that cannot start a
/// sequence (0x80-0xBF, 0xF8-0xFF), or a lead byte whose trail bytes are
/// missing or do not have the form 10xxxxxx, is replaced by a single space
/// and decoding resumes at the very next byte. Overlong encodings (such as
/// 0xC0 0x80 for U+0000) and encoded surrogates are accepted as they are.
std::u32string utf8_to_utf32(const std::string &utf8_str)
{
  std::u32string result;
  result.reserve(utf8_str.size());

  const std::string::size_type size = utf8_str.size();
  std::string::size_type i = 0;
  while(i < size)
  {
    const unsigned char lead = utf8_str[i++];

    // The most significant bits of the lead byte determine how many trail
    // bytes follow (0-3) and how many payload bits the lead byte carries.
    std::string::size_type trail_count = 0;
    char32_t code = 0;
    if(lead <= 0x7F)
    {
      // a single byte: the code is exactly the value
      code = lead;
    }
    else if(lead >= 0xC0 && lead <= 0xDF)
    {
      trail_count = 1;
      code = lead & 0x1Fu;
    }
    else if(lead >= 0xE0 && lead <= 0xEF)
    {
      trail_count = 2;
      code = lead & 0xFu;
    }
    else if(lead >= 0xF0 && lead <= 0xF7)
    {
      trail_count = 3;
      code = lead & 0x7u;
    }
    else
    {
      // a trail byte without a lead byte, or a byte value that is too high;
      // replace it with a space
      result.append(1, char32_t(32));
      continue;
    }

    // all trail bytes must be present and must start with bits 10
    bool well_formed = size - i >= trail_count;
    for(std::string::size_type k = 0; well_formed && k < trail_count; ++k)
    {
      const unsigned char trail = utf8_str[i + k];
      if((trail & 0xC0u) == 0x80u)
        code = (code << 6) | (trail & 0x3Fu);
      else
        well_formed = false;
    }

    if(well_formed)
    {
      i += trail_count;
      result.append(1, code);
    }
    else
    {
      // only the lead byte is consumed, so that valid characters following
      // a broken sequence are not swallowed
      result.append(1, char32_t(32));
    }
  }

  return result;
}
259265
260266// / Escapes non-printable characters, whitespace except for spaces, double
0 commit comments