Skip to content

Commit ea8393a

Browse files
Add char::to_titlecase
1 parent af3c572 commit ea8393a

7 files changed

Lines changed: 377 additions & 76 deletions

File tree

library/core/src/char/methods.rs

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,14 +1196,111 @@ impl char {
11961196
/// // convert into themselves.
11971197
/// assert_eq!('山'.to_lowercase().to_string(), "山");
11981198
/// ```
1199-
#[must_use = "this returns the lowercase character as a new iterator, \
1199+
#[must_use = "this returns the lowercased character as a new iterator, \
12001200
without modifying the original"]
12011201
#[stable(feature = "rust1", since = "1.0.0")]
12021202
#[inline]
12031203
pub fn to_lowercase(self) -> ToLowercase {
12041204
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
12051205
}
12061206

1207+
/// Returns an iterator that yields the titlecase mapping of this `char` as one or more
1208+
/// `char`s.
1209+
///
1210+
/// If this `char` does not have a titlecase mapping, the iterator yields the same `char`.
1211+
///
1212+
/// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character
1213+
/// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`.
1214+
///
1215+
/// [ucd]: https://www.unicode.org/reports/tr44/
1216+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1217+
///
1218+
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
1219+
/// the `char`(s) given by [`SpecialCasing.txt`].
1220+
///
1221+
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
1222+
///
1223+
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
1224+
/// is independent of context and language.
1225+
///
1226+
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
1227+
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
1228+
///
1229+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
1230+
///
1231+
/// # Examples
1232+
///
1233+
/// As an iterator:
1234+
///
1235+
/// ```
1236+
/// #![feature(titlecase)]
1237+
/// for c in 'ß'.to_titlecase() {
1238+
/// print!("{c}");
1239+
/// }
1240+
/// println!();
1241+
/// ```
1242+
///
1243+
/// Using `println!` directly:
1244+
///
1245+
/// ```
1246+
/// #![feature(titlecase)]
1247+
/// println!("{}", 'ß'.to_titlecase());
1248+
/// ```
1249+
///
1250+
/// Both are equivalent to:
1251+
///
1252+
/// ```
1253+
/// println!("Ss");
1254+
/// ```
1255+
///
1256+
/// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string):
1257+
///
1258+
/// ```
1259+
/// #![feature(titlecase)]
1260+
/// assert_eq!('c'.to_titlecase().to_string(), "C");
1261+
///
1262+
/// // Sometimes the result is more than one character:
1263+
/// assert_eq!('ß'.to_titlecase().to_string(), "Ss");
1264+
///
1265+
/// // Characters that do not have separate cased forms
1266+
/// // convert into themselves.
1267+
/// assert_eq!('山'.to_titlecase().to_string(), "山");
1268+
/// ```
1269+
///
1270+
/// # Note on locale
1271+
///
1272+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
1273+
///
1274+
/// * 'Dotless': I / ı, sometimes written ï
1275+
/// * 'Dotted': İ / i
1276+
///
1277+
/// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
1278+
///
1279+
/// ```
1280+
/// #![feature(titlecase)]
1281+
/// let upper_i = 'i'.to_titlecase().to_string();
1282+
/// ```
1283+
///
1284+
/// The value of `upper_i` here relies on the language of the text: if we're
1285+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
1286+
/// be `"İ"`. `to_titlecase()` does not take this into account, and so:
1287+
///
1288+
/// ```
1289+
/// #![feature(titlecase)]
1290+
/// let upper_i = 'i'.to_titlecase().to_string();
1291+
///
1292+
/// assert_eq!(upper_i, "I");
1293+
/// ```
1294+
///
1295+
/// holds across languages.
1296+
#[must_use = "this returns the titlecased character as a new iterator, \
1297+
without modifying the original"]
1298+
#[unstable(feature = "titlecase", issue = "none")]
1299+
#[inline]
1300+
pub fn to_titlecase(self) -> ToTitlecase {
1301+
ToTitlecase(CaseMappingIter::new(conversions::to_title(self)))
1302+
}
1303+
12071304
/// Returns an iterator that yields the uppercase mapping of this `char` as one or more
12081305
/// `char`s.
12091306
///
@@ -1267,7 +1364,7 @@ impl char {
12671364
///
12681365
/// # Note on locale
12691366
///
1270-
/// In Turkish, the equivalent of 'i' in Latin has five forms instead of two:
1367+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
12711368
///
12721369
/// * 'Dotless': I / ı, sometimes written ï
12731370
/// * 'Dotted': İ / i
@@ -1279,7 +1376,7 @@ impl char {
12791376
/// ```
12801377
///
12811378
/// The value of `upper_i` here relies on the language of the text: if we're
1282-
/// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should
1379+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
12831380
/// be `"İ"`. `to_uppercase()` does not take this into account, and so:
12841381
///
12851382
/// ```
@@ -1289,7 +1386,7 @@ impl char {
12891386
/// ```
12901387
///
12911388
/// holds across languages.
1292-
#[must_use = "this returns the uppercase character as a new iterator, \
1389+
#[must_use = "this returns the uppercased character as a new iterator, \
12931390
without modifying the original"]
12941391
#[stable(feature = "rust1", since = "1.0.0")]
12951392
#[inline]

library/core/src/char/mod.rs

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -363,13 +363,21 @@ impl fmt::Display for EscapeDebug {
363363
}
364364

365365
macro_rules! casemappingiter_impls {
366-
($(#[$attr:meta])* $ITER_NAME:ident) => {
366+
(
367+
#[$stab:meta]
368+
#[$dendstab:meta]
369+
#[$fusedstab:meta]
370+
#[$exactstab:meta]
371+
#[$displaystab:meta]
372+
$(#[$attr:meta])*
373+
$ITER_NAME:ident
374+
) => {
367375
$(#[$attr])*
368-
#[stable(feature = "rust1", since = "1.0.0")]
376+
#[$stab]
369377
#[derive(Debug, Clone)]
370378
pub struct $ITER_NAME(CaseMappingIter);
371379

372-
#[stable(feature = "rust1", since = "1.0.0")]
380+
#[$stab]
373381
impl Iterator for $ITER_NAME {
374382
type Item = char;
375383
fn next(&mut self) -> Option<char> {
@@ -405,7 +413,7 @@ macro_rules! casemappingiter_impls {
405413
}
406414
}
407415

408-
#[stable(feature = "case_mapping_double_ended", since = "1.59.0")]
416+
#[$dendstab]
409417
impl DoubleEndedIterator for $ITER_NAME {
410418
fn next_back(&mut self) -> Option<char> {
411419
self.0.next_back()
@@ -423,10 +431,10 @@ macro_rules! casemappingiter_impls {
423431
}
424432
}
425433

426-
#[stable(feature = "fused", since = "1.26.0")]
434+
#[$fusedstab]
427435
impl FusedIterator for $ITER_NAME {}
428436

429-
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
437+
#[$exactstab]
430438
impl ExactSizeIterator for $ITER_NAME {
431439
fn len(&self) -> usize {
432440
self.0.len()
@@ -453,7 +461,7 @@ macro_rules! casemappingiter_impls {
453461
#[unstable(feature = "std_internals", issue = "none")]
454462
unsafe impl TrustedRandomAccess for $ITER_NAME {}
455463

456-
#[stable(feature = "char_struct_display", since = "1.16.0")]
464+
#[$displaystab]
457465
impl fmt::Display for $ITER_NAME {
458466
#[inline]
459467
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -464,23 +472,48 @@ macro_rules! casemappingiter_impls {
464472
}
465473

466474
casemappingiter_impls! {
467-
/// Returns an iterator that yields the lowercase equivalent of a `char`.
475+
#[stable(feature = "rust1", since = "1.0.0")]
476+
#[stable(feature = "case_mapping_double_ended", since = "1.59.0")]
477+
#[stable(feature = "fused", since = "1.26.0")]
478+
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
479+
#[stable(feature = "char_struct_display", since = "1.16.0")]
480+
/// An iterator that yields the uppercase equivalent of a `char`.
468481
///
469-
/// This `struct` is created by the [`to_lowercase`] method on [`char`]. See
482+
/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See
470483
/// its documentation for more.
471484
///
472-
/// [`to_lowercase`]: char::to_lowercase
473-
ToLowercase
485+
/// [`to_uppercase`]: char::to_uppercase
486+
ToUppercase
474487
}
475488

476489
casemappingiter_impls! {
477-
/// Returns an iterator that yields the uppercase equivalent of a `char`.
490+
#[unstable(feature = "titlecase", issue = "none")]
491+
#[unstable(feature = "titlecase", issue = "none")]
492+
#[unstable(feature = "titlecase", issue = "none")]
493+
#[unstable(feature = "titlecase", issue = "none")]
494+
#[unstable(feature = "titlecase", issue = "none")]
495+
/// An iterator that yields the titlecase equivalent of a `char`.
478496
///
479-
/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See
497+
/// This `struct` is created by the [`to_titlecase`] method on [`char`]. See
480498
/// its documentation for more.
481499
///
482-
/// [`to_uppercase`]: char::to_uppercase
483-
ToUppercase
500+
/// [`to_titlecase`]: char::to_titlecase
501+
ToTitlecase
502+
}
503+
504+
casemappingiter_impls! {
505+
#[stable(feature = "rust1", since = "1.0.0")]
506+
#[stable(feature = "case_mapping_double_ended", since = "1.59.0")]
507+
#[stable(feature = "fused", since = "1.26.0")]
508+
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
509+
#[stable(feature = "char_struct_display", since = "1.16.0")]
510+
/// An iterator that yields the lowercase equivalent of a `char`.
511+
///
512+
/// This `struct` is created by the [`to_lowercase`] method on [`char`]. See
513+
/// its documentation for more.
514+
///
515+
/// [`to_lowercase`]: char::to_lowercase
516+
ToLowercase
484517
}
485518

486519
#[derive(Debug, Clone)]

library/core/src/unicode/unicode_data.rs

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading
1010
// to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT
1111
// to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT
12-
// Total : 9657 bytes
12+
// to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT
13+
// Total : 9997 bytes
1314

1415
#[inline(always)]
1516
const fn bitset_search<
@@ -823,14 +824,10 @@ pub mod conversions {
823824
unsafe { char::from_u32_unchecked(((plane as u32) << 16) | (low as u32)) }
824825
}
825826

826-
fn lookup(input: char, ascii: char, l1_lut: &L1Lut) -> [char; 3] {
827-
if input.is_ascii() {
828-
return [ascii, '\0', '\0'];
829-
}
830-
827+
fn lookup(input: char, l1_lut: &L1Lut) -> Option<[char; 3]> {
831828
let (input_high, input_low) = deconstruct(input);
832829
let Some(l2_lut) = l1_lut.l2_luts.get(input_high as usize) else {
833-
return [input, '\0', '\0'];
830+
return None;
834831
};
835832

836833
let idx = l2_lut.singles.binary_search_by(|(range, _)| {
@@ -844,6 +841,7 @@ pub mod conversions {
844841
Ordering::Equal
845842
}
846843
});
844+
847845
if let Ok(idx) = idx {
848846
// SAFETY: binary search guarantees that the index is in bounds.
849847
let &(range, output_delta) = unsafe { l2_lut.singles.get_unchecked(idx) };
@@ -852,7 +850,7 @@ pub mod conversions {
852850
let output_low = input_low.wrapping_add_signed(output_delta);
853851
// SAFETY: Table data are guaranteed to be valid Unicode.
854852
let output = unsafe { reconstruct(input_high, output_low) };
855-
return [output, '\0', '\0'];
853+
return Some([output, '\0', '\0']);
856854
}
857855
};
858856

@@ -861,18 +859,34 @@ pub mod conversions {
861859
let &(_, output_lows) = unsafe { l2_lut.multis.get_unchecked(idx) };
862860
// SAFETY: Table data are guaranteed to be valid Unicode.
863861
let output = output_lows.map(|output_low| unsafe { reconstruct(input_high, output_low) });
864-
return output;
862+
return Some(output);
865863
};
866864

867-
[input, '\0', '\0']
865+
None
868866
}
869867

870868
pub fn to_lower(c: char) -> [char; 3] {
871-
lookup(c, c.to_ascii_lowercase(), &LOWERCASE_LUT)
869+
if c.is_ascii() {
870+
return [c.to_ascii_lowercase(), '\0', '\0'];
871+
}
872+
873+
lookup(c, &LOWERCASE_LUT).unwrap_or([c, '\0', '\0'])
872874
}
873875

874876
pub fn to_upper(c: char) -> [char; 3] {
875-
lookup(c, c.to_ascii_uppercase(), &UPPERCASE_LUT)
877+
if c.is_ascii() {
878+
return [c.to_ascii_uppercase(), '\0', '\0'];
879+
}
880+
881+
lookup(c, &UPPERCASE_LUT).unwrap_or([c, '\0', '\0'])
882+
}
883+
884+
pub fn to_title(c: char) -> [char; 3] {
885+
if c.is_ascii() {
886+
return [c.to_ascii_uppercase(), '\0', '\0'];
887+
}
888+
889+
lookup(c, &TITLECASE_LUT).or_else(|| lookup(c, &UPPERCASE_LUT)).unwrap_or([c, '\0', '\0'])
876890
}
877891

878892
static LOWERCASE_LUT: L1Lut = L1Lut {
@@ -1150,4 +1164,45 @@ pub mod conversions {
11501164
},
11511165
],
11521166
};
1167+
1168+
static TITLECASE_LUT: L1Lut = L1Lut {
1169+
l2_luts: [
1170+
L2Lut {
1171+
singles: &[ // 26 entries, 156 bytes
1172+
(Range::singleton(0x01c4), 1), (Range::singleton(0x01c5), 0),
1173+
(Range::singleton(0x01c6), -1), (Range::singleton(0x01c7), 1),
1174+
(Range::singleton(0x01c8), 0), (Range::singleton(0x01c9), -1),
1175+
(Range::singleton(0x01ca), 1), (Range::singleton(0x01cb), 0),
1176+
(Range::singleton(0x01cc), -1), (Range::singleton(0x01f1), 1),
1177+
(Range::singleton(0x01f2), 0), (Range::singleton(0x01f3), -1),
1178+
(Range::step_by_1(0x10d0..=0x10fa), 0), (Range::step_by_1(0x10fd..=0x10ff), 0),
1179+
(Range::step_by_1(0x1f80..=0x1f87), 8), (Range::step_by_1(0x1f88..=0x1f8f), 0),
1180+
(Range::step_by_1(0x1f90..=0x1f97), 8), (Range::step_by_1(0x1f98..=0x1f9f), 0),
1181+
(Range::step_by_1(0x1fa0..=0x1fa7), 8), (Range::step_by_1(0x1fa8..=0x1faf), 0),
1182+
(Range::singleton(0x1fb3), 9), (Range::singleton(0x1fbc), 0), (Range::singleton(0x1fc3), 9),
1183+
(Range::singleton(0x1fcc), 0), (Range::singleton(0x1ff3), 9), (Range::singleton(0x1ffc), 0),
1184+
],
1185+
multis: &[ // 23 entries, 184 bytes
1186+
(0x00df, [0x0053, 0x0073, 0x0000]), (0x0587, [0x0535, 0x0582, 0x0000]),
1187+
(0x1fb2, [0x1fba, 0x0345, 0x0000]), (0x1fb4, [0x0386, 0x0345, 0x0000]),
1188+
(0x1fb7, [0x0391, 0x0342, 0x0345]), (0x1fc2, [0x1fca, 0x0345, 0x0000]),
1189+
(0x1fc4, [0x0389, 0x0345, 0x0000]), (0x1fc7, [0x0397, 0x0342, 0x0345]),
1190+
(0x1ff2, [0x1ffa, 0x0345, 0x0000]), (0x1ff4, [0x038f, 0x0345, 0x0000]),
1191+
(0x1ff7, [0x03a9, 0x0342, 0x0345]), (0xfb00, [0x0046, 0x0066, 0x0000]),
1192+
(0xfb01, [0x0046, 0x0069, 0x0000]), (0xfb02, [0x0046, 0x006c, 0x0000]),
1193+
(0xfb03, [0x0046, 0x0066, 0x0069]), (0xfb04, [0x0046, 0x0066, 0x006c]),
1194+
(0xfb05, [0x0053, 0x0074, 0x0000]), (0xfb06, [0x0053, 0x0074, 0x0000]),
1195+
(0xfb13, [0x0544, 0x0576, 0x0000]), (0xfb14, [0x0544, 0x0565, 0x0000]),
1196+
(0xfb15, [0x0544, 0x056b, 0x0000]), (0xfb16, [0x054e, 0x0576, 0x0000]),
1197+
(0xfb17, [0x0544, 0x056d, 0x0000]),
1198+
],
1199+
},
1200+
L2Lut {
1201+
singles: &[ // 0 entries, 0 bytes
1202+
],
1203+
multis: &[ // 0 entries, 0 bytes
1204+
],
1205+
},
1206+
],
1207+
};
11531208
}

0 commit comments

Comments
 (0)