Text API work, emoji segmentation.

This commit is contained in:
Samuel Guerra 2023-06-17 18:54:17 -03:00
parent 24db6ca231
commit 012f54e470
13 changed files with 8148 additions and 55 deletions

View File

@ -0,0 +1,21 @@
# Unicode Emoji
# © 2022 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see https://www.unicode.org/terms_of_use.html
This directory contains final data files for Unicode Emoji, Version 15.0
Public/emoji/15.0/
emoji-sequences.txt
emoji-zwj-sequences.txt
emoji-test.txt
The following related files are found in the UCD for Version 15.0
Public/15.0.0/ucd/emoji/
emoji-data.txt
emoji-variation-sequences.txt
For documentation, see UTS #51 Unicode Emoji, Version 15.0

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -28,4 +28,8 @@ harness = false
[[test]]
name = "config"
path = "config.rs"
path = "config.rs"
[[test]]
name = "text"
path = "text.rs"

45
tests/text.rs Normal file
View File

@ -0,0 +1,45 @@
use zero_ui::core::{context::LayoutDirection, text::*};
use std::fmt::Write as _;
/// Segments every `fully-qualified` and `component` sample listed in the Unicode
/// `emoji-test.txt` data file and checks that the sample is detected as an emoji.
///
/// For each accepted data line the text after the first `#` that follows the `;`
/// is segmented, and the first three segment kinds must be `Space`, `Emoji`, `Space`.
/// The first 20 failing samples are collected and reported in a single panic.
#[test]
fn emoji_segs() {
    // Maximum number of failing samples written to the report.
    const MAX_REPORTED: i32 = 20;

    let data = std::fs::read_to_string("../examples/res/text/unicode-emoji-15.0/emoji-test.txt").unwrap();
    let expected = vec![TextSegmentKind::Space, TextSegmentKind::Emoji, TextSegmentKind::Space];

    let mut report = String::new();
    let mut failures = 0;

    let samples = data
        .lines()
        // skip comment and blank lines.
        .filter(|l| !(l.starts_with('#') || l.is_empty()))
        // keep only the part after the code points column.
        .filter_map(|l| l.split_once(';'))
        .map(|(_, status)| status)
        // other status values are not tested.
        .filter(|status| status.starts_with(" fully-qualified") || status.starts_with(" component"))
        // the sample is everything after the first `#`.
        .filter_map(|status| status.split_once('#'))
        .map(|(_, sample)| sample);

    for test in samples {
        let segmented = SegmentedText::new(Txt::from_str(test), LayoutDirection::LTR);
        let kinds: Vec<_> = segmented.segs().iter().map(|s| s.kind).take(3).collect();
        if kinds == expected {
            continue;
        }

        failures += 1;
        if failures <= MAX_REPORTED {
            let _ = writeln!(&mut report, "{test}");
        }
    }

    if report.is_empty() {
        return;
    }
    if failures > MAX_REPORTED {
        let _ = writeln!(&mut report, "\n..and {} more errors", failures - MAX_REPORTED);
    }
    panic!("\n\n{report}");
}

View File

@ -101,7 +101,8 @@ hyphenation = { version = "0.8", default-features = false }
regex = "1"
unicode-bidi = "0.3"
unicode-segmentation = "1"
unic-emoji-char = "0.9"
icu_properties = "1"
icu_testdata = "1"
# task
flume = { version = "0.10", default-features = false, features = ["async"] }

View File

@ -14,6 +14,7 @@ use std::{
sync::Arc,
};
mod emoji_util;
pub mod font_features;
mod font_kit_cache;
mod unicode_bidi_util;

View File

@ -0,0 +1,37 @@
use icu_properties::sets;
/// Returns `true` if `c` is contained in the set loaded by [`sets::load_emoji`].
///
/// # Panics
///
/// Panics if the set cannot be loaded from the `icu_testdata` provider.
pub(super) fn maybe_emoji(c: char) -> bool {
    let provider = icu_testdata::unstable();
    let emoji = sets::load_emoji(&provider).unwrap();
    emoji.as_borrowed().contains(c)
}
/// Returns `true` if `c` is contained in the set loaded by [`sets::load_emoji_presentation`]
/// or if it is an emoji modifier (see [`is_modifier`]).
///
/// # Panics
///
/// Panics if the set cannot be loaded from the `icu_testdata` provider.
pub(super) fn definitely_emoji(c: char) -> bool {
    let provider = icu_testdata::unstable();
    let presentation = sets::load_emoji_presentation(&provider).unwrap();
    if presentation.as_borrowed().contains(c) {
        return true;
    }
    // the modifier set is only consulted when the presentation set did not match.
    is_modifier(c)
}
/// Returns `true` if `c` is contained in the set loaded by [`sets::load_emoji_modifier`].
///
/// # Panics
///
/// Panics if the set cannot be loaded from the `icu_testdata` provider.
pub(super) fn is_modifier(c: char) -> bool {
    let provider = icu_testdata::unstable();
    let modifiers = sets::load_emoji_modifier(&provider).unwrap();
    modifiers.as_borrowed().contains(c)
}
/// Returns `true` if `c` is contained in the set loaded by [`sets::load_emoji_component`].
///
/// # Panics
///
/// Panics if the set cannot be loaded from the `icu_testdata` provider.
pub(super) fn is_component(c: char) -> bool {
    let provider = icu_testdata::unstable();
    let components = sets::load_emoji_component(&provider).unwrap();
    components.as_borrowed().contains(c)
}
/*
Loaded data is !Send+!Sync so we probably don't need to cache it.
The "icu_testdata" crate includes the data we need, plus a lot of useless data. There is a complicated way to
optimize this, but they are about to release embedded data, so we wait.
see: https://github.com/unicode-org/icu4x/issues/3529
*/

View File

@ -1,6 +1,6 @@
use std::ops;
use crate::{context::LayoutDirection, crate_util::FxHashMap};
use crate::{context::LayoutDirection, crate_util::FxHashMap, text::emoji_util};
use super::Txt;
use unicode_bidi::BidiInfo;
@ -232,31 +232,43 @@ impl SegmentedText {
base_direction,
}
}
fn push_seg(text: &str, bidi: &BidiInfo, segs: &mut Vec<TextSegment>, end: usize) {
let start = segs.last().map(|s| s.end).unwrap_or(0);
let mut char_indices = text[start..end].char_indices().peekable();
let mut kind = TextSegmentKind::LeftToRight;
let mut level = BidiLevel::ltr();
for (i, c) in text[start..end].char_indices() {
let c_kind = if unic_emoji_char::is_emoji(c) {
TextSegmentKind::Emoji
} else {
const ZWJ: char = '\u{200D}'; // ZERO WIDTH JOINER
const VS0: char = '\u{FE00}'; // VARIANT SELECTOR 0
const VS16: char = '\u{FE0F}'; // VARIANT SELECTOR 16
if matches!(kind, TextSegmentKind::Emoji) && (c == ZWJ || (VS0..=VS16).contains(&c)) {
TextSegmentKind::Emoji
} else {
match TextSegmentKind::from(bidi.original_classes[start + i]) {
TextSegmentKind::OtherNeutral if super::unicode_bidi_util::bidi_bracket_data(c).is_some() => {
TextSegmentKind::Bracket(c)
}
k => k,
}
}
};
for (i, c) in &mut char_indices {
const ZWJ: char = '\u{200D}'; // ZERO WIDTH JOINER
const VS16: char = '\u{FE0F}'; // VARIANT SELECTOR 16 - Emoji
const CEK: char = '\u{20E3}'; // COMBINING ENCLOSING KEYCAP
let c_level = bidi.levels[start + i];
let is_emoji = (kind == TextSegmentKind::Emoji // maybe
&& (
c == VS16 // definitely, modifies prev. char into Emoji.
|| c == CEK // definitely, modified prev. char into keycap style.
|| c == ZWJ // definitely, ligature with the next Emoji or is ignored.
|| emoji_util::is_modifier(c) // definitely, has same effect as VS16.
|| emoji_util::is_component(c) // definitely, ligature data, like flag tags.
))
|| (emoji_util::maybe_emoji(c) // maybe
&& (emoji_util::definitely_emoji(c) // definitely
// only if followed by VS16 or modifier
|| (text[start+i..].chars().nth(1).map(|c| c == VS16 || emoji_util::is_modifier(c)).unwrap_or(false))));
let (c_kind, c_level) = if is_emoji {
(TextSegmentKind::Emoji, level)
} else {
let k = match TextSegmentKind::from(bidi.original_classes[start + i]) {
TextSegmentKind::OtherNeutral if super::unicode_bidi_util::bidi_bracket_data(c).is_some() => {
TextSegmentKind::Bracket(c)
}
k => k,
};
(k, bidi.levels[start + i])
};
if c_kind != kind || c_level != level || !c_kind.can_merge() {
if i > 0 {
@ -279,7 +291,7 @@ impl SegmentedText {
}
/// The raw segment data.
pub fn segments(&self) -> &[TextSegment] {
pub fn segs(&self) -> &[TextSegment] {
&self.segments
}
@ -676,13 +688,30 @@ mod tests {
#[test]
fn emoji_seg() {
let test = "'🙎🏻‍♀️'";
let test = "'🙎🏻‍♀️'1# 1⃣#️⃣";
let txt = SegmentedText::new(test, LayoutDirection::LTR);
let k: Vec<_> = txt.segments().iter().map(|s| s.kind).collect();
let k: Vec<_> = txt.segs().iter().map(|s| s.kind).collect();
assert_eq!(
vec![TextSegmentKind::OtherNeutral, TextSegmentKind::Emoji, TextSegmentKind::OtherNeutral],
vec![
TextSegmentKind::OtherNeutral, // '
TextSegmentKind::Emoji, // 🙎🏻‍♀️
TextSegmentKind::OtherNeutral, // '
TextSegmentKind::EuropeanNumber, // 1
TextSegmentKind::EuropeanTerminator, // #
TextSegmentKind::Space,
TextSegmentKind::Emoji, // 1⃣#️⃣
],
k
);
}
/// Regression check: every segment of a flag followed by tag characters
/// must be classified as `Emoji`.
#[test]
fn emoji_issues() {
    let flag = "🏴󠁧󠁢󠁥󠁮󠁧󠁿";
    let segmented = SegmentedText::new(flag, LayoutDirection::LTR);
    for (t, segment) in segmented.iter() {
        let kind = segment.kind;
        assert_eq!(kind, TextSegmentKind::Emoji, "text: {t:?}");
    }
}
}

View File

@ -491,8 +491,12 @@ impl ShapedText {
let block_size = self.block_size();
let align_size = constraints.fill_size_or(block_size);
let mut first = PxRect::from_size(self.first_line().map(|l| l.rect().size).unwrap_or_default());
let mut last = PxRect::from_size(self.last_line().map(|l| l.rect().size).unwrap_or_default());
let mut first = PxRect::from_size(self.line(0).map(|l| l.rect().size).unwrap_or_default());
let mut last = PxRect::from_size(
self.line(self.lines_len().saturating_sub(1))
.map(|l| l.rect().size)
.unwrap_or_default(),
);
last.origin.y = block_size.height - last.size.height;
first.origin.x = (align_size.width - first.size.width) * align_x;
@ -774,23 +778,15 @@ impl ShapedText {
self.first_wrapped
}
/// Gets the first line, if the text contains any line.
pub fn first_line(&self) -> Option<ShapedLine> {
self.lines().next()
}
/// Gets the last line, if the text contains any line.
///
/// This is more efficient than `t.lines().last()`.
pub fn last_line(&self) -> Option<ShapedLine> {
if self.lines.0.is_empty() {
/// Gets the line by index.
pub fn line(&self, line_idx: usize) -> Option<ShapedLine> {
if self.lines.0.len() <= line_idx {
None
} else {
let last_line = self.lines.0.len() - 1;
self.lines.iter_segs_skip(last_line).next().map(move |(w, r)| ShapedLine {
self.lines.iter_segs_skip(line_idx).next().map(move |(w, r)| ShapedLine {
text: self,
seg_range: r,
index: last_line,
index: line_idx,
width: Px(w.round() as i32),
})
}
@ -919,7 +915,7 @@ impl ShapedText {
}
}
if let Some(line) = self.last_line() {
if let Some(line) = self.line(self.lines_len().saturating_sub(1)) {
// top-right of last line
let rect = line.rect();
PxPoint::new(rect.max_x(), rect.min_y())
@ -932,9 +928,9 @@ impl ShapedText {
pub fn nearest_line(&self, y: Px) -> Option<ShapedLine> {
let first_line_max_y = self.first_line.max_y();
if first_line_max_y >= y {
self.first_line()
self.line(0)
} else if self.last_line.min_y() <= y {
self.last_line()
self.line(self.lines_len().saturating_sub(1))
} else {
let y = y - first_line_max_y;
let line = (y / self.line_height()).0 as usize + 1;
@ -1705,6 +1701,21 @@ impl<'a> ShapedLine<'a> {
self.seg_range.len()
}
/// Gets the segment at `seg_idx`, counted from the start of this line.
///
/// Index `0` is the first segment of the line. Returns `None` if `seg_idx` is not
/// less than the length of the line's segment range.
pub fn seg(&self, seg_idx: usize) -> Option<ShapedSegment> {
    let in_line = seg_idx < self.seg_range.len();
    in_line.then(|| ShapedSegment {
        text: self.text,
        line_index: self.index,
        // convert the line relative index to an index in the full text.
        index: self.seg_range.start() + seg_idx,
    })
}
/// Returns `true` if this line was started by the wrap algorithm.
///
/// If this is `false` then the line is the first or the previous line ends in a [`LineBreak`].

View File

@ -752,16 +752,19 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
}
InlineConstraints::Layout(l) => {
if !self.pending.contains(PendingLayout::RESHAPE)
&& (Some(l.first_segs.len()) != r.shaped_text.first_line().map(|l| l.segs_len())
|| Some(l.last_segs.len()) != r.shaped_text.last_line().map(|l| l.segs_len()))
&& (Some(l.first_segs.len()) != r.shaped_text.line(0).map(|l| l.segs_len())
|| Some(l.last_segs.len())
!= r.shaped_text
.line(r.shaped_text.lines_len().saturating_sub(1))
.map(|l| l.segs_len()))
{
self.pending.insert(PendingLayout::RESHAPE);
}
if !self.pending.contains(PendingLayout::RESHAPE_LINES)
&& (r.shaped_text.mid_clear() != l.mid_clear
|| r.shaped_text.first_line().map(|l| l.rect()) != Some(l.first)
|| r.shaped_text.last_line().map(|l| l.rect()) != Some(l.last))
|| r.shaped_text.line(0).map(|l| l.rect()) != Some(l.first)
|| r.shaped_text.line(r.shaped_text.lines_len().saturating_sub(1)).map(|l| l.rect()) != Some(l.last))
{
self.pending.insert(PendingLayout::RESHAPE_LINES);
}
@ -1181,7 +1184,7 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
let size = txt.layout(&metrics, &RESOLVED_TEXT.get(), true);
if let (Some(inline), Some(l)) = (wm.inline(), txt.txt.as_ref()) {
if let Some(first_line) = l.shaped_text.first_line() {
if let Some(first_line) = l.shaped_text.line(0) {
inline.first = first_line.original_size();
inline.with_first_segs(|i| {
for seg in first_line.segs() {
@ -1199,7 +1202,7 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
if l.shaped_text.lines_len() == 1 {
inline.last = inline.first;
inline.last_segs = inline.first_segs.clone();
} else if let Some(last_line) = l.shaped_text.last_line() {
} else if let Some(last_line) = l.shaped_text.line(l.shaped_text.lines_len().saturating_sub(1)) {
inline.last = last_line.original_size();
inline.with_last_segs(|i| {
for seg in last_line.segs() {
@ -1240,7 +1243,7 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
for (i, line) in l.shaped_text.lines().enumerate() {
if i == 0 {
let info = l.shaped_text.first_line().unwrap().segs().map(|s| s.inline_info());
let info = l.shaped_text.line(0).unwrap().segs().map(|s| s.inline_info());
if LAYOUT.direction().is_rtl() {
// help sort
inline.set_first_segs(info.rev());
@ -1248,7 +1251,12 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
inline.set_first_segs(info);
}
} else if i == last_line {
let info = l.shaped_text.last_line().unwrap().segs().map(|s| s.inline_info());
let info = l
.shaped_text
.line(l.shaped_text.lines_len().saturating_sub(1))
.unwrap()
.segs()
.map(|s| s.inline_info());
if LAYOUT.direction().is_rtl() {
// help sort
inline.set_last_segs(info.rev());

View File

@ -928,7 +928,7 @@ impl CaretStatus {
} else {
let mut line = 1;
let mut line_start = 0;
for seg in text.segments() {
for seg in text.segs() {
if seg.end > index {
break;
}
@ -992,8 +992,15 @@ pub enum LinesWrapCount {
/// The associated value is a vec of wrap-line count for each text line, is `1` for lines that don't wrap.
Wrap(Vec<u32>),
}
impl LinesWrapCount {}
impl LinesWrapCount {
    /// Gets the number of text lines.
    ///
    /// This is the stored count for `NoWrap` and the number of entries in the
    /// per-line vec for `Wrap`.
    pub fn lines_len(&self) -> usize {
        match *self {
            Self::Wrap(ref wraps) => wraps.len(),
            Self::NoWrap(count) => count,
        }
    }
}
/// Text paragraph properties.
///