diff --git a/core/string/src/builder.rs b/core/string/src/builder.rs index 6abf29bb7fd..72a018dd0e1 100644 --- a/core/string/src/builder.rs +++ b/core/string/src/builder.rs @@ -1,4 +1,4 @@ -use crate::vtable::sequence::DATA_OFFSET; +use crate::r#type::{Latin1, StringType, Utf16}; use crate::{JsStr, JsStrVariant, JsString, SequenceString, alloc_overflow}; use std::{ alloc::{Layout, alloc, dealloc, realloc}, @@ -8,23 +8,25 @@ use std::{ str::{self}, }; -/// A mutable builder to create instance of `JsString`. +/// A mutable builder to create instances of `JsString`. #[derive(Debug)] -pub struct JsStringBuilder { +#[allow(private_bounds)] +pub struct JsStringBuilder { cap: usize, len: usize, - inner: NonNull, + inner: NonNull>, phantom_data: PhantomData, } -impl Default for JsStringBuilder { +impl Default for JsStringBuilder { fn default() -> Self { Self::new() } } -impl JsStringBuilder { - const DATA_SIZE: usize = size_of::(); +#[allow(private_bounds)] +impl JsStringBuilder { + const DATA_SIZE: usize = size_of::(); const MIN_NON_ZERO_CAP: usize = 8 / Self::DATA_SIZE; /// Create a new `JsStringBuilder` with capacity of zero. @@ -75,7 +77,7 @@ impl JsStringBuilder { /// Returns the capacity calculated from given layout. #[must_use] const fn capacity_from_layout(layout: Layout) -> usize { - (layout.size() - DATA_OFFSET) / Self::DATA_SIZE + (layout.size() - D::DATA_OFFSET) / Self::DATA_SIZE } /// Create a new `JsStringBuilder` with specific capacity @@ -109,7 +111,7 @@ impl JsStringBuilder { self.inner != NonNull::dangling() } - /// Returns the inner `RawJsString`'s layout. + /// Returns the inner sequence string's layout. /// /// # Safety /// @@ -120,7 +122,7 @@ impl JsStringBuilder { // Caller should ensure that the inner is allocated. 
unsafe { Layout::for_value(self.inner.as_ref()) - .extend(Layout::array::(self.capacity()).unwrap_unchecked()) + .extend(Layout::array::(self.capacity()).unwrap_unchecked()) .unwrap_unchecked() .0 .pad_to_align() @@ -133,10 +135,10 @@ impl JsStringBuilder { /// /// Caller should ensure that the inner is allocated. #[must_use] - const unsafe fn data(&self) -> *mut D { - let seq_ptr = self.inner.as_ptr().cast::(); + const unsafe fn data(&self) -> *mut D::Char { + let seq_ptr: *mut D::Char = self.inner.as_ptr().cast(); // SAFETY: Caller should ensure that the inner is allocated. - unsafe { seq_ptr.add(DATA_OFFSET).cast() } + unsafe { seq_ptr.byte_add(D::DATA_OFFSET) } } /// Allocates when there is not sufficient capacity. @@ -160,16 +162,17 @@ impl JsStringBuilder { let old_layout = unsafe { self.current_layout() }; // SAFETY: // Valid pointer is required by `realloc` and pointer is checked above to be valid. - // The layout size of `RawJsString` is never zero, since it has to store + // The layout size of the sequence string is never zero, since it has to store // the length of the string and the reference count. unsafe { realloc(old_ptr.cast(), old_layout, new_layout.size()) } } else { // SAFETY: - // The layout size of `RawJsString` is never zero, since it has to store + // The layout size of the sequence string is never zero, since it has to store // the length of the string and the reference count. unsafe { alloc(new_layout) } }; - let Some(new_ptr) = NonNull::new(new_ptr.cast::()) else { + + let Some(new_ptr) = NonNull::new(new_ptr.cast::>()) else { std::alloc::handle_alloc_error(new_layout) }; self.inner = new_ptr; @@ -178,7 +181,7 @@ impl JsStringBuilder { /// Appends an element to the inner `RawJsString` of `JsStringBuilder`. 
#[inline] - pub fn push(&mut self, v: D) { + pub fn push(&mut self, v: D::Char) { let required_cap = self.len() + 1; self.allocate_if_needed(required_cap); // SAFETY: @@ -198,7 +201,7 @@ impl JsStringBuilder { /// /// Caller should ensure the capacity is large enough to hold elements. #[inline] - pub const unsafe fn extend_from_slice_unchecked(&mut self, v: &[D]) { + pub const unsafe fn extend_from_slice_unchecked(&mut self, v: &[D::Char]) { // SAFETY: Caller should ensure the capacity is large enough to hold elements. unsafe { ptr::copy_nonoverlapping(v.as_ptr(), self.data().add(self.len()), v.len()); @@ -208,7 +211,7 @@ impl JsStringBuilder { /// Pushes elements from slice to `JsStringBuilder`. #[inline] - pub fn extend_from_slice(&mut self, v: &[D]) { + pub fn extend_from_slice(&mut self, v: &[D::Char]) { let required_cap = self.len() + v.len(); self.allocate_if_needed(required_cap); // SAFETY: @@ -219,13 +222,13 @@ impl JsStringBuilder { } fn new_layout(cap: usize) -> Layout { - let new_layout = Layout::array::(cap) - .and_then(|arr| Layout::new::().extend(arr)) + let new_layout = Layout::array::(cap) + .and_then(|arr| Layout::new::>().extend(arr)) .map(|(layout, offset)| (layout.pad_to_align(), offset)) .map_err(|_| None); match new_layout { Ok((new_layout, offset)) => { - debug_assert_eq!(offset, DATA_OFFSET); + debug_assert_eq!(offset, D::DATA_OFFSET); new_layout } Err(None) => alloc_overflow(), @@ -287,7 +290,7 @@ impl JsStringBuilder { /// /// Caller should ensure the capacity is large enough to hold elements. #[inline] - pub const unsafe fn push_unchecked(&mut self, v: D) { + pub const unsafe fn push_unchecked(&mut self, v: D::Char) { // SAFETY: Caller should ensure the capacity is large enough to hold elements. unsafe { self.data().add(self.len()).write(v); @@ -318,7 +321,7 @@ impl JsStringBuilder { /// Extracts a slice containing the elements in the inner `RawJsString`. 
#[inline] #[must_use] - pub fn as_slice(&self) -> &[D] { + pub fn as_slice(&self) -> &[D::Char] { if self.is_allocated() { // SAFETY: // The inner `RawJsString` is allocated which means it is not null. @@ -335,7 +338,7 @@ impl JsStringBuilder { /// Use of a builder whose contents are not valid encoding is undefined behavior. #[inline] #[must_use] - pub unsafe fn as_mut_slice(&mut self) -> &mut [D] { + pub unsafe fn as_mut_slice(&mut self) -> &mut [D::Char] { if self.is_allocated() { // SAFETY: // The inner `RawJsString` is allocated which means it is not null. @@ -348,7 +351,7 @@ impl JsStringBuilder { /// Builds `JsString` from `JsStringBuilder` #[inline] #[must_use] - fn build_inner(mut self, latin1: bool) -> JsString { + fn build_inner(mut self) -> JsString { if self.is_empty() { return JsString::default(); } @@ -366,18 +369,18 @@ impl JsStringBuilder { // `NonNull` verified for us that the pointer returned by `alloc` is valid, // meaning we can write to its pointed memory. unsafe { - inner.as_ptr().write(SequenceString::new(len, latin1)); + inner.as_ptr().write(SequenceString::::new(len)); } // Tell the compiler not to call the destructor of `JsStringBuilder`, - // because we move inner `RawJsString` to `JsString`. + // because we move inner sequence string to `JsString`. 
std::mem::forget(self); JsString { ptr: inner.cast() } } } -impl Drop for JsStringBuilder { +impl Drop for JsStringBuilder { /// Set cold since [`JsStringBuilder`] should be created to build `JsString` #[cold] #[inline] @@ -397,21 +400,21 @@ impl Drop for JsStringBuilder { } } -impl AddAssign<&JsStringBuilder> for JsStringBuilder { +impl AddAssign<&JsStringBuilder> for JsStringBuilder { #[inline] fn add_assign(&mut self, rhs: &JsStringBuilder) { self.extend_from_slice(rhs.as_slice()); } } -impl AddAssign<&[D]> for JsStringBuilder { +impl AddAssign<&[D::Char]> for JsStringBuilder { #[inline] - fn add_assign(&mut self, rhs: &[D]) { + fn add_assign(&mut self, rhs: &[D::Char]) { self.extend_from_slice(rhs); } } -impl Add<&JsStringBuilder> for JsStringBuilder { +impl Add<&JsStringBuilder> for JsStringBuilder { type Output = Self; #[inline] @@ -421,19 +424,19 @@ impl Add<&JsStringBuilder> for JsStringBuilder { } } -impl Add<&[D]> for JsStringBuilder { +impl Add<&[D::Char]> for JsStringBuilder { type Output = Self; #[inline] - fn add(mut self, rhs: &[D]) -> Self::Output { + fn add(mut self, rhs: &[D::Char]) -> Self::Output { self.extend_from_slice(rhs); self } } -impl Extend for JsStringBuilder { +impl Extend for JsStringBuilder { #[inline] - fn extend>(&mut self, iter: I) { + fn extend>(&mut self, iter: I) { let iterator = iter.into_iter(); let (lower_bound, _) = iterator.size_hint(); let require_cap = self.len() + lower_bound; @@ -442,18 +445,18 @@ impl Extend for JsStringBuilder { } } -impl FromIterator for JsStringBuilder { +impl FromIterator for JsStringBuilder { #[inline] - fn from_iter>(iter: T) -> Self { + fn from_iter>(iter: T) -> Self { let mut builder = Self::new(); builder.extend(iter); builder } } -impl From<&[D]> for JsStringBuilder { +impl From<&[D::Char]> for JsStringBuilder { #[inline] - fn from(value: &[D]) -> Self { + fn from(value: &[D::Char]) -> Self { let mut builder = Self::with_capacity(value.len()); // SAFETY: The capacity is large enough to hold 
elements. unsafe { builder.extend_from_slice_unchecked(value) }; @@ -461,14 +464,19 @@ impl From<&[D]> for JsStringBuilder { } } -impl PartialEq for JsStringBuilder { +impl PartialEq for JsStringBuilder +where + D::Char: Eq + PartialEq, +{ #[inline] fn eq(&self, other: &Self) -> bool { - self.as_slice().eq(other.as_slice()) + let slice: &[D::Char] = self.as_slice(); + let other_slice: &[D::Char] = other.as_slice(); + slice.eq(other_slice) } } -impl Clone for JsStringBuilder { +impl Clone for JsStringBuilder { #[inline] fn clone(&self) -> Self { if self.is_allocated() { @@ -491,7 +499,7 @@ impl Clone for JsStringBuilder { if source_len > self.capacity() { self.allocate(source_len); } else { - // At this point, inner `RawJsString` of self or source can be not allocated, + // At this point, inner sequence string of self or source can be not allocated, // returns earlier to avoid copying from/to `null`. if source_len == 0 { // SAFETY: 0 is always less or equal to self's capacity. @@ -528,7 +536,7 @@ impl Clone for JsStringBuilder { /// s.extend([b'1', b'2', b'3']); /// let js_string = s.build(); /// ``` -pub type Latin1JsStringBuilder = JsStringBuilder; +pub type Latin1JsStringBuilder = JsStringBuilder; impl Latin1JsStringBuilder { /// Builds a `JsString` if the current instance is strictly `ASCII`. 
@@ -544,7 +552,7 @@ impl Latin1JsStringBuilder { #[must_use] pub fn build(self) -> Option { if self.is_ascii() { - Some(self.build_inner(true)) + Some(self.build_inner()) } else { None } @@ -562,7 +570,7 @@ impl Latin1JsStringBuilder { #[inline] #[must_use] pub unsafe fn build_as_latin1(self) -> JsString { - self.build_inner(true) + self.build_inner() } } @@ -577,14 +585,14 @@ impl Latin1JsStringBuilder { /// s.extend([0xD83C, 0xDFB9, 0xD83C, 0xDFB6, 0xD83C, 0xDFB5]); // 🎹🎶🎵 /// let js_string = s.build(); /// ``` -pub type Utf16JsStringBuilder = JsStringBuilder; +pub type Utf16JsStringBuilder = JsStringBuilder; impl Utf16JsStringBuilder { /// Builds `JsString` from `Utf16JsStringBuilder` #[inline] #[must_use] pub fn build(self) -> JsString { - self.build_inner(false) + self.build_inner() } } diff --git a/core/string/src/display.rs b/core/string/src/display.rs index 2586c8342be..fac511b4649 100644 --- a/core/string/src/display.rs +++ b/core/string/src/display.rs @@ -90,7 +90,7 @@ impl fmt::Debug for JsStringDebugInfo<'_> { // Show kind specific fields from string. 
match self.inner.kind() { - JsStringKind::Sequence => { + JsStringKind::Latin1Sequence | JsStringKind::Utf16Sequence => { if let Some(rc) = self.inner.refcount() { dbg.borrow_mut().field("refcount", &rc); } diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 2343e744a62..b77ac5e2463 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -18,6 +18,7 @@ mod common; mod display; mod iter; mod str; +mod r#type; mod vtable; #[cfg(test)] @@ -25,6 +26,7 @@ mod tests; use self::{iter::Windows, str::JsSliceIndex}; use crate::display::{JsStrDisplayEscaped, JsStrDisplayLossy, JsStringDebugInfo}; +use crate::r#type::{Latin1, Utf16}; pub use crate::vtable::StaticString; use crate::vtable::{SequenceString, SliceString}; #[doc(inline)] @@ -36,14 +38,13 @@ pub use crate::{ str::{JsStr, JsStrVariant}, }; use std::marker::PhantomData; +use std::{borrow::Cow, mem::ManuallyDrop}; use std::{ - alloc::{Layout, alloc}, convert::Infallible, hash::{Hash, Hasher}, ptr::{self, NonNull}, str::FromStr, }; -use std::{borrow::Cow, mem::ManuallyDrop}; use vtable::JsStringVTable; fn alloc_overflow() -> ! { @@ -96,47 +97,22 @@ pub struct RawJsString { phantom_data: PhantomData<*mut ()>, } -/// A `usize` contains a flag and the length of Latin1/UTF-16 . -/// ```text -/// β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -/// β”‚ length (usize::BITS - 1) β”‚ flag(1) β”‚ -/// β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -/// ``` -/// The latin1/UTF-16 flag is stored in the bottom bit. 
-#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] -#[repr(transparent)] -struct TaggedLen(usize); - -impl TaggedLen { - const LATIN1_BITFLAG: usize = 1 << 0; - const BITFLAG_COUNT: usize = 1; - - const fn new(len: usize, latin1: bool) -> Self { - Self((len << Self::BITFLAG_COUNT) | (latin1 as usize)) - } - - const fn is_latin1(self) -> bool { - (self.0 & Self::LATIN1_BITFLAG) != 0 - } - - const fn len(self) -> usize { - self.0 >> Self::BITFLAG_COUNT - } -} - /// Strings can be represented internally by multiple kinds. This is used to identify /// the storage kind of string. #[derive(Debug, Clone, Copy, Eq, PartialEq)] #[repr(u8)] pub(crate) enum JsStringKind { - /// A sequential memory slice of either UTF-8 or UTF-16. See [`SequenceString`]. - Sequence = 0, + /// A sequential memory slice of Latin1 bytes. See [`SequenceString`]. + Latin1Sequence = 0, + + /// A sequential memory slice of UTF-16 code units. See [`SequenceString`]. + Utf16Sequence = 1, /// A slice of an existing string. See [`SliceString`]. - Slice = 1, + Slice = 2, /// A static string that is valid for `'static` lifetime. - Static = 2, + Static = 3, } /// A Latin1 or UTF-16–encoded, reference counted, immutable string. @@ -587,13 +563,19 @@ impl JsString { full_count = sum; } - let ptr = Self::allocate_seq(full_count, latin1_encoding); + let (ptr, data_offset) = if latin1_encoding { + let p = SequenceString::::allocate(full_count); + (p.cast::(), size_of::>()) + } else { + let p = SequenceString::::allocate(full_count); + (p.cast::(), size_of::>()) + }; let string = { - // SAFETY: `allocate_seq` guarantees that `ptr` is a valid pointer to a `SequenceString`. + // SAFETY: `allocate_*_seq` guarantees that `ptr` is a valid pointer to a sequence string. 
let mut data = unsafe { - let seq_ptr = ptr.as_ptr().cast::(); - seq_ptr.add(size_of::()) + let seq_ptr = ptr.as_ptr(); + seq_ptr.add(data_offset) }; for &string in strings { // SAFETY: @@ -640,112 +622,38 @@ impl JsString { StaticJsStrings::get_string(&string.as_str()).unwrap_or(string) } - /// Allocates a new [`SequenceString`] with an internal capacity of `str_len` chars. - /// - /// # Panics - /// - /// Panics if `try_allocate_seq` returns `Err`. - fn allocate_seq(str_len: usize, latin1: bool) -> NonNull { - match Self::try_allocate_seq(str_len, latin1) { - Ok(v) => v, - Err(None) => alloc_overflow(), - Err(Some(layout)) => std::alloc::handle_alloc_error(layout), - } - } - - // This is marked as safe because it is always valid to call this function to request any number - // of `u16`, since this function ought to fail on an OOM error. - /// Allocates a new [`SequenceString`] with an internal capacity of `str_len` chars. - /// - /// # Errors - /// - /// Returns `Err(None)` on integer overflows `usize::MAX`. - /// Returns `Err(Some(Layout))` on allocation error. - fn try_allocate_seq( - str_len: usize, - latin1: bool, - ) -> Result, Option> { - let (layout, offset) = if latin1 { - Layout::array::(str_len) - } else { - Layout::array::(str_len) - } - .and_then(|arr| Layout::new::().extend(arr)) - .map(|(layout, offset)| (layout.pad_to_align(), offset)) - .map_err(|_| None)?; - - debug_assert_eq!(offset, vtable::sequence::DATA_OFFSET); - debug_assert_eq!(layout.align(), align_of::()); - - #[allow(clippy::cast_ptr_alignment)] - // SAFETY: - // The layout size of `SequenceString` is never zero, since it has to store - // the length of the string and the reference count. - let inner = unsafe { alloc(layout).cast::() }; - - // We need to verify that the pointer returned by `alloc` is not null, otherwise - // we should abort, since an allocation error is pretty unrecoverable for us - // right now. 
- let inner = NonNull::new(inner).ok_or(Some(layout))?; - - // SAFETY: - // `NonNull` verified for us that the pointer returned by `alloc` is valid, - // meaning we can write to its pointed memory. - unsafe { - // Write the first part, the `SequenceString`. - inner.as_ptr().write(SequenceString::new(str_len, latin1)); - } - - debug_assert!({ - let inner = inner.as_ptr(); - // SAFETY: - // - `inner` must be a valid pointer, since it comes from a `NonNull`, - // meaning we can safely dereference it to `SequenceString`. - // - `offset` should point us to the beginning of the array, - // and since we requested a `SequenceString` layout with a trailing - // `[u16; str_len]`, the memory of the array must be in the `usize` - // range for the allocation to succeed. - unsafe { - ptr::eq( - inner.cast::().add(offset).cast(), - (*inner).data().cast_mut(), - ) - } - }); - - Ok(inner) - } - /// Creates a new [`JsString`] from `data`, without checking if the string is in the interner. fn from_slice_skip_interning(string: JsStr<'_>) -> Self { let count = string.len(); - let ptr = Self::allocate_seq(count, string.is_latin1()); - - // SAFETY: `allocate_seq` guarantees that `ptr` is a valid pointer. - let data = unsafe { (&raw mut (*ptr.as_ptr()).data).cast::() }; // SAFETY: // - We read `count = data.len()` elements from `data`, which is within the bounds of the slice. - // - `allocate_seq` must allocate at least `count` elements, which allows us to safely + // - `allocate_*_seq` must allocate at least `count` elements, which allows us to safely // write at least `count` elements. - // - `allocate_seq` should already take care of the alignment of `ptr`, and `data` must be + // - `allocate_*_seq` should already take care of the alignment of `ptr`, and `data` must be // aligned to be a valid slice. 
- // - `allocate_seq` must return a valid pointer to newly allocated memory, meaning `ptr` + // - `allocate_*_seq` must return a valid pointer to newly allocated memory, meaning `ptr` // and `data` should never overlap. unsafe { // NOTE: The alignment is checked when we allocate the array. #[allow(clippy::cast_ptr_alignment)] match string.variant() { JsStrVariant::Latin1(s) => { - ptr::copy_nonoverlapping(s.as_ptr(), data.cast::(), count); + let ptr = SequenceString::::allocate(count); + let data = (&raw mut (*ptr.as_ptr()).data) + .cast::<::Byte>(); + ptr::copy_nonoverlapping(s.as_ptr(), data, count); + Self { ptr: ptr.cast() } } JsStrVariant::Utf16(s) => { - ptr::copy_nonoverlapping(s.as_ptr(), data.cast::(), count); + let ptr = SequenceString::::allocate(count); + let data = (&raw mut (*ptr.as_ptr()).data) + .cast::<::Byte>(); + ptr::copy_nonoverlapping(s.as_ptr(), data, count); + Self { ptr: ptr.cast() } } } } - - Self { ptr: ptr.cast() } } /// Creates a new [`JsString`] from `data`. diff --git a/core/string/src/str.rs b/core/string/src/str.rs index a766767aebf..3164e84ccd0 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -1,13 +1,12 @@ use super::iter::{CodePointsIter, Windows}; use crate::{ - CodePoint, Iter, TaggedLen, + CodePoint, Iter, display::{JsStrDisplayEscaped, JsStrDisplayLossy}, is_trimmable_whitespace, is_trimmable_whitespace_latin1, }; use std::ptr::NonNull; use std::{ hash::{Hash, Hasher}, - marker::PhantomData, slice::SliceIndex, }; @@ -51,18 +50,11 @@ pub enum JsStrVariant<'a> { Utf16(&'a [u16]), } -#[derive(Clone, Copy)] -struct Inner<'a> { - tagged_len: TaggedLen, - ptr: *const u8, - _marker: PhantomData<&'a [u8]>, -} - /// This is equivalent to Rust's `&str`. #[derive(Clone, Copy)] #[repr(align(8))] pub struct JsStr<'a> { - inner: Inner<'a>, + inner: JsStrVariant<'a>, } // SAFETY: Inner<'_> has only immutable references to Sync types (u8/u16), so this is safe. 
@@ -81,11 +73,7 @@ impl<'a> JsStr<'a> { #[must_use] pub const fn latin1(value: &'a [u8]) -> Self { Self { - inner: Inner { - tagged_len: TaggedLen::new(value.len(), true), - ptr: value.as_ptr(), - _marker: PhantomData, - }, + inner: JsStrVariant::Latin1(value), } } @@ -94,11 +82,7 @@ impl<'a> JsStr<'a> { #[must_use] pub const fn utf16(value: &'a [u16]) -> Self { Self { - inner: Inner { - tagged_len: TaggedLen::new(value.len(), false), - ptr: value.as_ptr().cast::(), - _marker: PhantomData, - }, + inner: JsStrVariant::Utf16(value), } } @@ -106,60 +90,44 @@ impl<'a> JsStr<'a> { #[inline] #[must_use] pub const fn len(&self) -> usize { - self.inner.tagged_len.len() + match &self.inner { + JsStrVariant::Latin1(value) => value.len(), + JsStrVariant::Utf16(value) => value.len(), + } } /// Return the inner [`JsStrVariant`] variant of the [`JsStr`]. #[inline] #[must_use] pub const fn variant(self) -> JsStrVariant<'a> { - let len = self.inner.tagged_len.len(); - - if self.inner.tagged_len.is_latin1() { - // SAFETY: We check that the ptr points to a latin1 (i.e. &[u8]), so this is safe. - let slice = unsafe { std::slice::from_raw_parts(self.inner.ptr, len) }; - - JsStrVariant::Latin1(slice) - } else { - // SAFETY: Non-latin1 ptr always points to a valid &[u16] slice, so this is safe. - #[allow(clippy::cast_ptr_alignment)] - let ptr = self.inner.ptr.cast::(); - - // SAFETY: We check that the ptr points to an utf16 slice, so this is safe. - let slice = unsafe { std::slice::from_raw_parts(ptr, len) }; - - JsStrVariant::Utf16(slice) - } + self.inner } /// Returns a pointer to the start of the data. #[inline] #[must_use] pub(crate) const fn as_ptr(&self) -> NonNull { - // SAFETY: If this object is created, this should never be null. 
- unsafe { NonNull::new_unchecked(self.inner.ptr.cast_mut()) } + match self.inner { + JsStrVariant::Latin1(value) => NonNull::from_ref(value).cast(), // whole-slice borrow: no panic on empty input + JsStrVariant::Utf16(value) => NonNull::from_ref(value).cast(), + } } /// Check if the [`JsStr`] is latin1 encoded. #[inline] #[must_use] pub const fn is_latin1(&self) -> bool { - self.inner.tagged_len.is_latin1() + matches!(self.inner, JsStrVariant::Latin1(_)) } /// Returns [`u8`] slice if the [`JsStr`] is latin1 encoded, otherwise [`None`]. #[inline] #[must_use] pub const fn as_latin1(&self) -> Option<&[u8]> { - if self.is_latin1() { - let len = self.inner.tagged_len.len(); - - // SAFETY: ptr is always a valid pointer to a slice data. - let slice = unsafe { std::slice::from_raw_parts(self.inner.ptr, len) }; - return Some(slice); + match self.inner { + JsStrVariant::Latin1(value) => Some(value), + JsStrVariant::Utf16(_) => None, } - - None } /// Iterate over the codepoints of the string. diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 3c3cef017d7..0227c2cb275 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -496,7 +496,7 @@ fn code_points_optimization() { fn slice() { let sliced = { let base_str = JsString::from("Hello World"); - assert_eq!(base_str.kind(), JsStringKind::Sequence); + assert_eq!(base_str.kind(), JsStringKind::Latin1Sequence); base_str.slice(1, 5) }; @@ -532,7 +532,7 @@ fn slice() { #[test] fn split() { let base_str = JsString::from("Hello World"); - assert_eq!(base_str.kind(), JsStringKind::Sequence); + assert_eq!(base_str.kind(), JsStringKind::Latin1Sequence); let str1 = base_str.slice(0, 5); let str2 = base_str.slice(6, base_str.len()); diff --git a/core/string/src/type.rs b/core/string/src/type.rs new file mode 100644 index 00000000000..c924c75f945 --- /dev/null +++ b/core/string/src/type.rs @@ -0,0 +1,90 @@ +//! Module containing string types public and crate-specific. 
+use crate::vtable::SequenceString; +use crate::{JsStr, JsStringKind}; +use std::alloc::Layout; + +pub(crate) mod sealed { + use crate::{JsStr, JsStringKind}; + use std::alloc::Layout; + + /// Seal to prevent others from implementing their own string types. + pub trait Sealed {} + + /// Internal trait for crate-specific usage. Contains implementation details + /// that should not leak through the API. + #[allow(private_interfaces)] + pub trait InternalStringType { + /// The offset to the data field in the sequence string struct. + const DATA_OFFSET: usize; + + /// The kind of string produced by this string type. + const KIND: JsStringKind; + + /// The type of one character for this string type. + type Byte: Copy + Eq + 'static; + + /// Create the base layout for the sequence string header. + fn base_layout() -> Layout; + + /// Construct a [`JsStr`] from a slice of characters. + fn str_ctor(slice: &[Self::Byte]) -> JsStr<'_>; + } +} +use sealed::{InternalStringType, Sealed}; + +/// Trait that maps the data type to the appropriate internal types and constants. +pub trait StringType: InternalStringType + Sealed { + type Char: Copy + Eq + 'static; +} + +// It is good defensive programming to have [`Latin1`] `!Copy`, as it should +// not be used as a value anyway. +#[allow(missing_copy_implementations)] +#[derive(Debug)] +pub enum Latin1 {} + +impl Sealed for Latin1 {} +impl StringType for Latin1 { + type Char = u8; +} + +#[allow(private_interfaces)] +impl InternalStringType for Latin1 { + const DATA_OFFSET: usize = size_of::>(); + const KIND: JsStringKind = JsStringKind::Latin1Sequence; + type Byte = u8; + + fn base_layout() -> Layout { + Layout::new::>() + } + + fn str_ctor(slice: &[Self::Byte]) -> JsStr<'_> { + JsStr::latin1(slice) + } +} + +// It is good defensive programming to have [`Utf16`] `!Copy`, as it should +// not be used as a value anyway. 
+#[allow(missing_copy_implementations)] +#[derive(Debug)] +pub enum Utf16 {} + +impl Sealed for Utf16 {} +impl StringType for Utf16 { + type Char = u16; +} + +#[allow(private_interfaces)] +impl InternalStringType for Utf16 { + const DATA_OFFSET: usize = size_of::>(); + const KIND: JsStringKind = JsStringKind::Utf16Sequence; + type Byte = u16; + + fn base_layout() -> Layout { + Layout::new::>() + } + + fn str_ctor(slice: &[Self::Byte]) -> JsStr<'_> { + JsStr::utf16(slice) + } +} diff --git a/core/string/src/vtable/mod.rs b/core/string/src/vtable/mod.rs index adb446b14d4..f3979320dc8 100644 --- a/core/string/src/vtable/mod.rs +++ b/core/string/src/vtable/mod.rs @@ -2,7 +2,7 @@ use crate::{JsStr, JsString, JsStringKind}; use std::ptr::NonNull; -pub(crate) mod sequence; +mod sequence; pub(crate) use sequence::SequenceString; pub(crate) mod slice; diff --git a/core/string/src/vtable/sequence.rs b/core/string/src/vtable/sequence.rs index a2ee19f9cd5..8abbdbdff14 100644 --- a/core/string/src/vtable/sequence.rs +++ b/core/string/src/vtable/sequence.rs @@ -1,45 +1,119 @@ //! `VTable` implementations for [`SequenceString`]. +use crate::r#type::StringType; use crate::vtable::JsStringVTable; -use crate::{JsStr, JsString, JsStringKind, TaggedLen}; -use std::alloc::{Layout, dealloc}; +use crate::{JsStr, JsString, alloc_overflow}; +use std::alloc::{Layout, alloc, dealloc}; use std::cell::Cell; +use std::marker::PhantomData; use std::process::abort; +use std::ptr; use std::ptr::NonNull; -pub(crate) const DATA_OFFSET: usize = size_of::(); - -/// A sequential memory array of strings. +/// A sequential memory array of `T::Char` elements. +/// +/// # Notes +/// A [`SequenceString`] is `!Sync` (using [`Cell`]) and invariant over `T` (strings +/// of various types cannot be used interchangeably). The string, however, could be +/// `Send`, although within Boa this does not make sense. 
#[repr(C)] -pub(crate) struct SequenceString { - /// Embedded `VTable` - must be first field for vtable dispatch. +pub(crate) struct SequenceString { + /// Embedded `VTable` - must be the first field for vtable dispatch. vtable: JsStringVTable, - tagged_len: TaggedLen, refcount: Cell, + // Forces invariant contract. + _marker: PhantomData T>, pub(crate) data: [u8; 0], } -impl SequenceString { - /// Creates a dummy [`SequenceString - /// `]. This should only be used to write to +impl SequenceString { + /// Creates a [`SequenceString`] without data. This should only be used to write to /// an allocation which contains all the information. #[inline] #[must_use] - pub(crate) fn new(len: usize, is_latin1: bool) -> Self { + pub(crate) fn new(len: usize) -> Self { SequenceString { vtable: JsStringVTable { - clone: seq_clone, - drop: seq_drop, - as_str: seq_as_str, - refcount: seq_refcount, + clone: seq_clone::, + drop: seq_drop::, + as_str: seq_as_str::, + refcount: seq_refcount::, len, - kind: JsStringKind::Sequence, + kind: T::KIND, }, - tagged_len: TaggedLen::new(len, is_latin1), refcount: Cell::new(1), + _marker: PhantomData, data: [0; 0], } } + /// Allocates a new [`SequenceString`] with an internal capacity of `len` characters. + /// + /// # Panics + /// + /// Panics if `try_allocate` returns `Err`. + pub(crate) fn allocate(len: usize) -> NonNull> { + match Self::try_allocate(len) { + Ok(v) => v, + Err(None) => alloc_overflow(), + Err(Some(layout)) => std::alloc::handle_alloc_error(layout), + } + } + + /// Allocates a new [`SequenceString`] with an internal capacity of `len` characters. + /// + /// # Errors + /// + /// Returns `Err(None)` on integer overflows `usize::MAX`. + /// Returns `Err(Some(Layout))` on allocation error. 
+ pub(crate) fn try_allocate(len: usize) -> Result, Option> { + let (layout, offset) = Layout::array::(len) + .and_then(|arr| T::base_layout().extend(arr)) + .map(|(layout, offset)| (layout.pad_to_align(), offset)) + .map_err(|_| None)?; + + debug_assert_eq!(offset, T::DATA_OFFSET); + debug_assert_eq!(layout.align(), align_of::()); + + #[allow(clippy::cast_ptr_alignment)] + // SAFETY: + // The layout size of `SequenceString` is never zero, since it has to store + // the length of the string and the reference count. + let inner = unsafe { alloc(layout).cast::() }; + + // We need to verify that the pointer returned by `alloc` is not null, otherwise + // we should abort, since an allocation error is pretty unrecoverable for us + // right now. + let inner = NonNull::new(inner).ok_or(Some(layout))?; + + // SAFETY: + // `NonNull` verified for us that the pointer returned by `alloc` is valid, + // meaning we can write to its pointed memory. + unsafe { + // Write the first part, the `SequenceString`. + inner.as_ptr().write(Self::new(len)); + } + + debug_assert!({ + let inner = inner.as_ptr(); + // SAFETY: + // - `inner` must be a valid pointer, since it comes from a `NonNull`, + // meaning we can safely dereference it to `SequenceString`. + // - `offset` should point us to the beginning of the array, + // and since we requested a `SequenceString` layout with a trailing + // `[T::Byte; len]`, the memory of the array must be in the `usize` + // range for the allocation to succeed. + unsafe { + // This is `<u8>` as the offset is in bytes. + ptr::eq( + inner.cast::().add(offset).cast(), + (*inner).data().cast_mut(), + ) + } + }); + + Ok(inner) + } + /// Returns the pointer to the data. #[inline] #[must_use] @@ -48,9 +122,9 @@ impl SequenceString { } } -fn seq_clone(vtable: NonNull) -> JsString { +fn seq_clone(vtable: NonNull) -> JsString { // SAFETY: This is part of the correct vtable which is validated on construction. 
- let this: &SequenceString = unsafe { vtable.cast().as_ref() }; + let this: &SequenceString = unsafe { vtable.cast().as_ref() }; let Some(strong) = this.refcount.get().checked_add(1) else { abort(); }; @@ -59,9 +133,9 @@ fn seq_clone(vtable: NonNull) -> JsString { unsafe { JsString::from_ptr(vtable) } } -fn seq_drop(vtable: NonNull) { +fn seq_drop(vtable: NonNull) { // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SequenceString = unsafe { vtable.cast().as_ref() }; + let this: &SequenceString = unsafe { vtable.cast().as_ref() }; let Some(new) = this.refcount.get().checked_sub(1) else { abort(); }; @@ -72,50 +146,34 @@ fn seq_drop(vtable: NonNull) { // SAFETY: All the checks for the validity of the layout have already been made on allocation. let layout = unsafe { - if this.tagged_len.is_latin1() { - Layout::for_value(this) - .extend(Layout::array::(this.tagged_len.len()).unwrap_unchecked()) - .unwrap_unchecked() - .0 - .pad_to_align() - } else { - Layout::for_value(this) - .extend(Layout::array::(this.tagged_len.len()).unwrap_unchecked()) - .unwrap_unchecked() - .0 - .pad_to_align() - } + Layout::for_value(this) + .extend(Layout::array::(this.vtable.len).unwrap_unchecked()) + .unwrap_unchecked() + .0 + .pad_to_align() }; // SAFETY: If refcount is 0, this is the last reference, so deallocating is safe. unsafe { - dealloc(vtable.as_ptr().cast::(), layout); + dealloc(vtable.as_ptr().cast(), layout); } } -fn seq_as_str(vtable: NonNull) -> JsStr<'static> { +fn seq_as_str(vtable: NonNull) -> JsStr<'static> { // SAFETY: This is part of the correct vtable which is validated on construction. 
- let this: &SequenceString = unsafe { vtable.cast().as_ref() }; - let len = this.tagged_len.len(); - let is_latin1 = this.tagged_len.is_latin1(); - let data_ptr = (&raw const this.data).cast::(); + let this: &SequenceString = unsafe { vtable.cast().as_ref() }; + let len = this.vtable.len; + let data_ptr = (&raw const this.data).cast::(); - // SAFETY: SequenceString - // data is always valid and properly aligned. - unsafe { - if is_latin1 { - JsStr::latin1(std::slice::from_raw_parts(data_ptr, len)) - } else { - #[allow(clippy::cast_ptr_alignment)] - JsStr::utf16(std::slice::from_raw_parts(data_ptr.cast::(), len)) - } - } + // SAFETY: SequenceString data is always valid and properly aligned. + let slice = unsafe { std::slice::from_raw_parts(data_ptr, len) }; + T::str_ctor(slice) } /// `VTable` function for refcount, need to return an `Option`. #[allow(clippy::unnecessary_wraps)] -fn seq_refcount(vtable: NonNull) -> Option { +fn seq_refcount(vtable: NonNull) -> Option { // SAFETY: This is part of the correct vtable which is validated on construction. - let this: &SequenceString = unsafe { vtable.cast().as_ref() }; + let this: &SequenceString = unsafe { vtable.cast().as_ref() }; Some(this.refcount.get()) } diff --git a/core/string/src/vtable/slice.rs b/core/string/src/vtable/slice.rs index 720652b03ff..a468823378e 100644 --- a/core/string/src/vtable/slice.rs +++ b/core/string/src/vtable/slice.rs @@ -1,5 +1,5 @@ use crate::vtable::JsStringVTable; -use crate::{JsStr, JsString, JsStringKind, TaggedLen}; +use crate::{JsStr, JsString, JsStringKind}; use std::cell::Cell; use std::process::abort; use std::ptr::NonNull; @@ -14,8 +14,10 @@ pub(crate) struct SliceString { // Pointer to the data itself. This is guaranteed to be safe as long as `owned` is // owned. data: NonNull, - // Length (and latin1 tag) for this string. We drop start/end. - tagged_len: TaggedLen, + // Length of this string slice. + len: usize, + // Whether the string is Latin1 encoded. 
+ is_latin1: bool, // Refcount for this string as we need to clone/drop it as well. refcount: Cell, } @@ -36,7 +38,8 @@ impl SliceString { }, owned: owned.clone(), data, - tagged_len: TaggedLen::new(len, is_latin1), + len, + is_latin1, refcount: Cell::new(1), } } @@ -81,8 +84,8 @@ fn slice_drop(vtable: NonNull) { fn slice_as_str(vtable: NonNull) -> JsStr<'static> { // SAFETY: This is part of the correct vtable which is validated on construction. let this: &SliceString = unsafe { vtable.cast().as_ref() }; - let len = this.tagged_len.len(); - let is_latin1 = this.tagged_len.is_latin1(); + let len = this.len; + let is_latin1 = this.is_latin1; let data_ptr = this.data.as_ptr(); // SAFETY: SliceString data points to valid memory owned by owned.