Skip to content

Commit f6573c2

Browse files
committed
refactor string
1 parent 71f621b commit f6573c2

31 files changed

Lines changed: 428 additions & 533 deletions

README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
1010
A Lua 5.5 interpreter written in pure Rust. Faithfully ported from the official C Lua source code architecture — register-based VM, incremental/generational GC, string interning — and **passes the official Lua 5.5 test suite** (`all.lua` — 28/30 test files, 435 unit tests).
1111

12+
Lua strings in luars are stored as byte strings with an optional UTF-8 text view. At the Lua level this matches Lua's byte-oriented string semantics; on the Rust side you can choose `as_str()` for text or `as_bytes()` / `create_bytes()` when exact bytes matter.
13+
1214
## Highlights
1315

1416
- **Lua 5.5** — compiler, VM, and standard libraries implement the Lua 5.5 specification
@@ -79,6 +81,17 @@ vm.register_type_of::<Point>("Point")?;
7981
vm.register_enum::<Color>("Color")?;
8082
```
8183

84+
### Strings And Bytes
85+
86+
```rust
87+
let text = vm.create_string("hello")?;
88+
assert_eq!(text.as_str(), Some("hello"));
89+
90+
let raw = vm.create_bytes(&[0xff, 0x00, b'A'])?;
91+
assert_eq!(raw.as_str(), None);
92+
assert_eq!(raw.as_bytes(), Some(&[0xff, 0x00, b'A'][..]));
93+
```
94+
8295
### Tables
8396

8497
```rust
@@ -236,7 +249,7 @@ For the full list of behavioral differences, see [docs/Different.md](docs/Differ
236249
- **No C API / C module loading** — pure Rust, no `lua_State*` interface
237250
- **No debug hooks** — `debug.sethook` is a stub; `getinfo` / `getlocal` / `traceback` work
238251
- **Own bytecode format** — `string.dump` output is not compatible with C Lua
239-
- **UTF-8 strings** — no arbitrary binary bytes (use the separate `binary` type)
252+
- **Rust text view is explicit** — Lua strings are byte strings, but Rust-side `as_str()` only succeeds for valid UTF-8; use `as_bytes()` for exact byte access
240253

241254
## Building
242255

crates/luars/src/compiler/expr_parser.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,7 @@ fn simpleexp(fs: &mut FuncState, v: &mut ExpDesc) -> Result<(), String> {
134134
let string_content = parse_string_token_value(text, fs.lexer.current_token());
135135
match string_content {
136136
Ok(bytes) => {
137-
let string = match String::from_utf8(bytes.clone()) {
138-
Ok(s) => fs.vm.create_string(&s).unwrap(),
139-
Err(_) => fs.vm.create_binary(bytes).unwrap(),
140-
};
137+
let string = fs.vm.create_bytes(&bytes).unwrap();
141138
// Create VKSTR expression (not VK!) - will convert to VK when needed
142139
*v = ExpDesc::new_vkstr(string);
143140
}

crates/luars/src/gc/gc_kind.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,4 @@ pub enum GcObjectKind {
99
Upvalue = 5,
1010
Thread = 6,
1111
Userdata = 7,
12-
Binary = 8,
1312
}

crates/luars/src/gc/gc_object.rs

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,6 @@ impl<T> HasGcHeader for Gc<T> {
305305
}
306306

307307
pub type GcString = Gc<LuaString>;
308-
pub type GcBinary = Gc<Vec<u8>>;
309308
pub type GcTable = Gc<LuaTable>;
310309
pub type GcFunction = Gc<LuaFunction>;
311310
pub type GcCClosure = Gc<CClosureFunction>;
@@ -401,7 +400,6 @@ impl<T: HasGcHeader> GcPtr<T> {
401400
}
402401

403402
pub type UpvaluePtr = GcPtr<GcUpvalue>;
404-
pub type BinaryPtr = GcPtr<GcBinary>;
405403
pub type TablePtr = GcPtr<GcTable>;
406404
pub type StringPtr = GcPtr<GcString>;
407405
pub type FunctionPtr = GcPtr<GcFunction>;
@@ -454,8 +452,6 @@ impl GcObjectPtr {
454452
const TAG_UPVALUE: u64 = 5;
455453
const TAG_THREAD: u64 = 6;
456454
const TAG_USERDATA: u64 = 7;
457-
const TAG_BINARY: u64 = 8;
458-
459455
#[inline(always)]
460456
fn new_tagged(ptr: u64, tag: u64) -> Self {
461457
debug_assert!(
@@ -569,12 +565,6 @@ impl GcObjectPtr {
569565
UserdataPtr::from_raw(self.raw_ptr())
570566
}
571567

572-
#[inline(always)]
573-
pub fn as_binary_ptr(&self) -> BinaryPtr {
574-
debug_assert!(self.tag() == Self::TAG_BINARY as u8);
575-
BinaryPtr::from_raw(self.raw_ptr())
576-
}
577-
578568
// ============ Pattern matching helpers (for code that still uses if-let) ============
579569

580570
#[inline(always)]
@@ -616,11 +606,6 @@ impl GcObjectPtr {
616606
pub fn is_userdata(&self) -> bool {
617607
self.tag() == Self::TAG_USERDATA as u8
618608
}
619-
620-
#[inline(always)]
621-
pub fn is_binary(&self) -> bool {
622-
self.tag() == Self::TAG_BINARY as u8
623-
}
624609
}
625610

626611
impl From<StringPtr> for GcObjectPtr {
@@ -630,13 +615,6 @@ impl From<StringPtr> for GcObjectPtr {
630615
}
631616
}
632617

633-
impl From<BinaryPtr> for GcObjectPtr {
634-
#[inline(always)]
635-
fn from(ptr: BinaryPtr) -> Self {
636-
Self::new_tagged(ptr.as_u64(), Self::TAG_BINARY)
637-
}
638-
}
639-
640618
impl From<TablePtr> for GcObjectPtr {
641619
#[inline(always)]
642620
fn from(ptr: TablePtr) -> Self {
@@ -696,7 +674,6 @@ pub enum GcObjectOwner {
696674
Userdata(Box<GcUserdata>),
697675
CClosure(Box<GcCClosure>),
698676
RClosure(Box<GcRClosure>),
699-
Binary(Box<GcBinary>),
700677
}
701678

702679
impl GcObjectOwner {
@@ -706,7 +683,6 @@ impl GcObjectOwner {
706683
pub fn compute_size(&self) -> usize {
707684
match self {
708685
GcObjectOwner::String(s) => std::mem::size_of::<GcString>() + s.data.str.len(),
709-
GcObjectOwner::Binary(b) => std::mem::size_of::<GcBinary>() + b.data.len(),
710686
GcObjectOwner::Table(t) => {
711687
let base = std::mem::size_of::<GcTable>();
712688
let asize = t.data.impl_table.asize as usize;
@@ -750,7 +726,6 @@ impl GcObjectOwner {
750726
GcObjectOwner::Upvalue(u) => &u.header,
751727
GcObjectOwner::Thread(t) => &t.header,
752728
GcObjectOwner::Userdata(u) => &u.header,
753-
GcObjectOwner::Binary(b) => &b.header,
754729
}) as _
755730
}
756731

@@ -764,7 +739,6 @@ impl GcObjectOwner {
764739
GcObjectOwner::Upvalue(u) => &mut u.header,
765740
GcObjectOwner::Thread(t) => &mut t.header,
766741
GcObjectOwner::Userdata(u) => &mut u.header,
767-
GcObjectOwner::Binary(b) => &mut b.header,
768742
}) as _
769743
}
770744

@@ -812,13 +786,6 @@ impl GcObjectOwner {
812786
}
813787
}
814788

815-
pub fn as_binary_ptr(&self) -> Option<BinaryPtr> {
816-
match self {
817-
GcObjectOwner::Binary(b) => Some(BinaryPtr::new(b.as_ref() as *const GcBinary)),
818-
_ => None,
819-
}
820-
}
821-
822789
pub fn as_closure_ptr(&self) -> Option<CClosurePtr> {
823790
match self {
824791
GcObjectOwner::CClosure(c) => Some(CClosurePtr::new(c.as_ref() as *const GcCClosure)),
@@ -853,9 +820,6 @@ impl GcObjectOwner {
853820
GcObjectOwner::Userdata(u) => {
854821
GcObjectPtr::from(UserdataPtr::new(u.as_ref() as *const GcUserdata))
855822
}
856-
GcObjectOwner::Binary(b) => {
857-
GcObjectPtr::from(BinaryPtr::new(b.as_ref() as *const GcBinary))
858-
}
859823
GcObjectOwner::CClosure(c) => {
860824
GcObjectPtr::from(CClosurePtr::new(c.as_ref() as *const GcCClosure))
861825
}

crates/luars/src/gc/mod.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,7 @@ impl GC {
772772
/// For other objects: returns true if white (will be collected)
773773
fn is_cleared(&mut self, l: &mut LuaState, gc_ptr: GcObjectPtr) -> bool {
774774
match gc_ptr.kind() {
775-
GcObjectKind::String | GcObjectKind::Binary => {
775+
GcObjectKind::String => {
776776
self.mark_object(l, gc_ptr);
777777
false
778778
}
@@ -3383,8 +3383,8 @@ impl GC {
33833383
);
33843384
}
33853385
}
3386-
// String/Binary have no GC references
3387-
GcObjectOwner::String(_) | GcObjectOwner::Binary(_) => {}
3386+
// Strings have no GC references
3387+
GcObjectOwner::String(_) => {}
33883388
}
33893389
}
33903390
}
@@ -3543,7 +3543,6 @@ impl GC {
35433543
GcObjectOwner::CClosure(b) => b.as_ref() as *const _ as u64,
35443544
GcObjectOwner::RClosure(b) => b.as_ref() as *const _ as u64,
35453545
GcObjectOwner::String(b) => b.as_ref() as *const _ as u64,
3546-
GcObjectOwner::Binary(b) => b.as_ref() as *const _ as u64,
35473546
GcObjectOwner::Thread(b) => b.as_ref() as *const _ as u64,
35483547
GcObjectOwner::Upvalue(b) => b.as_ref() as *const _ as u64,
35493548
GcObjectOwner::Userdata(b) => b.as_ref() as *const _ as u64,
@@ -3969,7 +3968,7 @@ impl GC {
39693968
/// ```.
39703969
fn really_mark_object(&mut self, l: &mut LuaState, gc_ptr: GcObjectPtr) {
39713970
self.gc_marked += 64; // fixed estimate; exact size unavailable from GcObjectPtr
3972-
if gc_ptr.is_string() || gc_ptr.is_binary() {
3971+
if gc_ptr.is_string() {
39733972
gc_ptr.header_mut().unwrap().make_black(); // Leaves become black immediately
39743973
} else if gc_ptr.is_upvalue() {
39753974
let uv_ptr = gc_ptr.as_upvalue_ptr();

crates/luars/src/gc/object_allocator.rs

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use crate::lua_value::{
1515
};
1616
use crate::lua_vm::{CFunction, LuaState};
1717
use crate::{
18-
GC, GcBinary, GcCClosure, GcFunction, GcObjectOwner, GcRClosure, GcTable, GcThread, GcUpvalue,
18+
GC, GcCClosure, GcFunction, GcObjectOwner, GcRClosure, GcTable, GcThread, GcUpvalue,
1919
GcUserdata, LuaFunction, LuaResult, LuaTable, LuaValue, StringPtr, UpvaluePtr,
2020
};
2121
use std::rc::Rc;
@@ -59,17 +59,18 @@ impl ObjectAllocator {
5959
self.strings.intern_owned(s, gc)
6060
}
6161

62-
/// Create a binary value from Vec<u8>
63-
///
62+
/// Create a Lua string-like value from raw bytes.
63+
/// All short byte strings are interned so Lua string equality keeps its fast path.
64+
#[inline]
65+
pub fn create_bytes(&mut self, gc: &mut GC, bytes: &[u8]) -> CreateResult {
66+
self.strings.intern_bytes(bytes, gc)
67+
}
68+
69+
/// Create a raw byte string from Vec<u8> without requiring UTF-8.
70+
/// This compatibility path now uses the same byte-string interning rules as `create_bytes`.
6471
#[inline]
6572
pub fn create_binary(&mut self, gc: &mut GC, data: Vec<u8>) -> CreateResult {
66-
let current_white = gc.current_white;
67-
let size = (std::mem::size_of::<GcBinary>() + data.len()) as u32;
68-
let gc_ptr = Box::new(GcBinary::new(data, current_white, size));
69-
let gc_binary = GcObjectOwner::Binary(gc_ptr);
70-
let ptr = gc_binary.as_binary_ptr().unwrap();
71-
gc.trace_object(gc_binary)?;
72-
Ok(LuaValue::binary(ptr))
73+
self.strings.intern_bytes_owned(data, gc)
7374
}
7475

7576
/// Create a substring from an existing string (optimized for string.sub)
@@ -84,14 +85,8 @@ impl ObjectAllocator {
8485
start: usize,
8586
end: usize,
8687
) -> CreateResult {
87-
// Get bytes - handle both string and binary types
88-
let source_str = s_value.as_str();
89-
let source_is_ascii = source_str.is_some_and(str::is_ascii);
90-
let bytes = if let Some(s) = source_str {
91-
s.as_bytes()
92-
} else if let Some(b) = s_value.as_binary() {
93-
b
94-
} else {
88+
let source_is_ascii = s_value.as_str().is_some_and(str::is_ascii);
89+
let Some(bytes) = s_value.as_bytes() else {
9590
return self.create_string(gc, "");
9691
};
9792

@@ -121,17 +116,7 @@ impl ObjectAllocator {
121116
std::str::from_utf8_unchecked(substring_bytes)
122117
})
123118
} else {
124-
match std::str::from_utf8(substring_bytes) {
125-
Ok(valid_str) => {
126-
// Valid UTF-8 - intern as string
127-
self.create_string(gc, valid_str)
128-
}
129-
Err(_) => {
130-
// Invalid UTF-8 - create binary value to preserve original bytes
131-
// This is important for binary data like bytecode
132-
self.create_binary(gc, substring_bytes.to_vec())
133-
}
134-
}
119+
self.create_bytes(gc, substring_bytes)
135120
}
136121
}
137122

0 commit comments

Comments
 (0)