Skip to content

Commit 3c51152

Browse files
committed
Rework and fix parsing of Unicode strings
We cannot arbitrarily convert strings to a Unicode representation, as this would affect their size. Instead, only the Unicode portion of a non-wide string must be encoded: for example, "\xe8" must remain a single byte, whereas "\u0201" is encoded as two UTF-8 bytes. As part of this work, the code duplication between unescape_string and unescape_wide_string is resolved. A new regression test confirms that the previous implementation was broken (and is now fixed).
1 parent f195d69 commit 3c51152

File tree

4 files changed

+129
-149
lines changed

4 files changed

+129
-149
lines changed

regression/cbmc/hex_string1/main.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#include <assert.h>
2+
3+
#define static_assert(x) ((struct { char some[(x)?1:-1]; }*)0)
4+
5+
int main()
6+
{
7+
static_assert('\xe8'==(char)0xe8);
8+
static_assert(sizeof("abc")==4);
9+
static_assert(sizeof("\u0201")==3);
10+
static_assert(sizeof("\xe8")==2);
11+
static_assert(sizeof("\u0201\xe8")==4);
12+
13+
if("\xe8"[0]!=(char)0xe8)
14+
assert(0);
15+
return 0;
16+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
CORE
2+
main.c
3+
4+
^EXIT=0$
5+
^SIGNAL=0$
6+
^VERIFICATION SUCCESSFUL$
7+
--
8+
^warning: ignoring
9+
^CONVERSION ERROR$

src/ansi-c/literals/convert_string_literal.cpp

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ std::basic_string<unsigned int> convert_one_string_literal(
4747

4848
// pad into wide string
4949
value.resize(utf8_value.size());
50-
for(unsigned i=0; i<utf8_value.size(); i++)
50+
for(std::size_t i=0; i<utf8_value.size(); i++)
5151
value[i]=utf8_value[i];
5252

5353
return value;
@@ -64,16 +64,14 @@ std::basic_string<unsigned int> convert_one_string_literal(
6464
assert(src[0]=='"');
6565
assert(src[src.size()-1]=='"');
6666

67-
std::basic_string<unsigned int> value=
68-
unescape_wide_string(std::string(src, 1, src.size()-2));
69-
70-
// turn into utf-8
71-
std::string utf8_value=utf32_to_utf8(value);
67+
std::string char_value=
68+
unescape_string(std::string(src, 1, src.size()-2));
7269

7370
// pad into wide string
74-
value.resize(utf8_value.size());
75-
for(unsigned i=0; i<utf8_value.size(); i++)
76-
value[i]=utf8_value[i];
71+
std::basic_string<unsigned int> value;
72+
value.resize(char_value.size());
73+
for(std::size_t i=0; i<char_value.size(); i++)
74+
value[i]=char_value[i];
7775

7876
return value;
7977
}
@@ -101,7 +99,7 @@ exprt convert_string_literal(const std::string &src)
10199

102100
char wide=0;
103101

104-
for(unsigned i=0; i<src.size(); i++)
102+
for(std::size_t i=0; i<src.size(); i++)
105103
{
106104
char ch=src[i];
107105

@@ -115,27 +113,24 @@ exprt convert_string_literal(const std::string &src)
115113
wide=ch;
116114

117115
// find start of sequence
118-
unsigned j=i;
119-
while(j<src.size() && src[j]!='"') j++;
116+
std::size_t j=src.find('"', i);
117+
if(j==std::string::npos)
118+
throw "invalid string constant `"+src+"'";
120119

121120
// find end of sequence, considering escaping
122-
j++;
123-
while(j<src.size() && src[j]!='"')
124-
{
125-
if(src[j]=='\\')
126-
j+=2;
127-
else
128-
j++;
129-
}
130-
131-
if(j<src.size())
132-
{
133-
std::string tmp_src=std::string(src, i, j-i+1);
134-
std::basic_string<unsigned int> tmp_value=
135-
convert_one_string_literal(tmp_src);
136-
value.append(tmp_value);
137-
i=j;
138-
}
121+
for(++j; j<src.size() && src[j]!='"'; ++j)
122+
if(src[j]=='\\') // skip next character
123+
++j;
124+
125+
assert(j<=src.size());
126+
if(j==src.size())
127+
throw "non-terminated string constant `"+src+"'";
128+
129+
std::string tmp_src=std::string(src, i, j-i+1);
130+
std::basic_string<unsigned int> tmp_value=
131+
convert_one_string_literal(tmp_src);
132+
value.append(tmp_value);
133+
i=j;
139134
}
140135

141136
if(wide!=0)
@@ -161,7 +156,7 @@ exprt convert_string_literal(const std::string &src)
161156
result.type().set(ID_size, from_integer(value.size(), index_type()));
162157

163158
result.operands().resize(value.size());
164-
for(unsigned i=0; i<value.size(); i++)
159+
for(std::size_t i=0; i<value.size(); i++)
165160
result.operands()[i]=from_integer(value[i], subtype);
166161

167162
return result;
@@ -172,7 +167,7 @@ exprt convert_string_literal(const std::string &src)
172167

173168
char_value.resize(value.size());
174169

175-
for(unsigned i=0; i<value.size(); i++)
170+
for(std::size_t i=0; i<value.size(); i++)
176171
{
177172
// Loss of data here if value[i]>255.
178173
// gcc issues a warning in this case.

0 commit comments

Comments
 (0)