Text API work, emoji segmentation.

2023-06-17 18:54:17 -03:00 · 2023-06-17 18:54:17 -03:00 · 012f54e470
parent 24db6ca231
commit 012f54e470
13 changed files with 8148 additions and 55 deletions
--- a/examples/res/text/unicode-emoji-15.0/ReadMe.txt
+++ b/examples/res/text/unicode-emoji-15.0/ReadMe.txt
@ -0,0 +1,21 @@
+# Unicode Emoji
+# © 2022 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see https://www.unicode.org/terms_of_use.html
+
+This directory contains final data files for Unicode Emoji, Version 15.0
+
+Public/emoji/15.0/
+
+  emoji-sequences.txt
+  emoji-zwj-sequences.txt
+  emoji-test.txt
+
+The following related files are found in the UCD for Version 15.0
+
+Public/15.0.0/ucd/emoji/
+
+  emoji-data.txt
+  emoji-variation-sequences.txt
+
+For documentation, see UTS #51 Unicode Emoji, Version 15.0
--- a/examples/res/text/unicode-emoji-15.0/emoji-sequences.txt
+++ b/examples/res/text/unicode-emoji-15.0/emoji-sequences.txt
--- a/examples/res/text/unicode-emoji-15.0/emoji-test.txt
+++ b/examples/res/text/unicode-emoji-15.0/emoji-test.txt
--- a/examples/res/text/unicode-emoji-15.0/emoji-zwj-sequences.txt
+++ b/examples/res/text/unicode-emoji-15.0/emoji-zwj-sequences.txt
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@ -28,4 +28,8 @@ harness = false

 [[test]]
 name = "config"
-path = "config.rs"
+path = "config.rs"
+
+[[test]]
+name = "text"
+path = "text.rs"
--- a/tests/text.rs
+++ b/tests/text.rs
@ -0,0 +1,45 @@
+use zero_ui::core::{context::LayoutDirection, text::*};
+
+use std::fmt::Write as _;
+
+#[test]
+fn emoji_segs() {
+    let tests = std::fs::read_to_string("../examples/res/text/unicode-emoji-15.0/emoji-test.txt").unwrap();
+
+    let mut errors = String::new();
+    let mut error_count = 0;
+
+    for line in tests.lines() {
+        if line.starts_with('#') || line.is_empty() {
+            continue;
+        }
+
+        let line = if let Some((_, test)) = line.split_once(';') {
+            if !test.starts_with(" fully-qualified") && !test.starts_with(" component") {
+                continue;
+            }
+            test
+        } else {
+            continue;
+        };
+
+        if let Some((_, test)) = line.split_once('#') {
+            let txt = SegmentedText::new(Txt::from_str(test), LayoutDirection::LTR);
+            let k: Vec<_> = txt.segs().iter().map(|s| s.kind).take(3).collect();
+
+            if k != vec![TextSegmentKind::Space, TextSegmentKind::Emoji, TextSegmentKind::Space] {
+                error_count += 1;
+                if error_count <= 20 {
+                    let _ = writeln!(&mut errors, "{test}");
+                }
+            }
+        }
+    }
+
+    if !errors.is_empty() {
+        if error_count > 20 {
+            let _ = writeln!(&mut errors, "\n..and {} more errors", error_count - 20);
+        }
+        panic!("\n\n{errors}");
+    }
+}
--- a/zero-ui-core/Cargo.toml
+++ b/zero-ui-core/Cargo.toml
@ -101,7 +101,8 @@ hyphenation = { version = "0.8", default-features = false }
 regex = "1"
 unicode-bidi = "0.3"
 unicode-segmentation = "1"
-unic-emoji-char = "0.9"
+icu_properties = "1"
+icu_testdata = "1"

 # task
 flume = { version = "0.10", default-features  = false, features = ["async"] }
--- a/zero-ui-core/src/text.rs
+++ b/zero-ui-core/src/text.rs
@ -14,6 +14,7 @@ use std::{
    sync::Arc,
 };

+mod emoji_util;
 pub mod font_features;
 mod font_kit_cache;
 mod unicode_bidi_util;
--- a/zero-ui-core/src/text/emoji_util.rs
+++ b/zero-ui-core/src/text/emoji_util.rs
@ -0,0 +1,37 @@
+use icu_properties::sets;
+
+pub(super) fn maybe_emoji(c: char) -> bool {
+    sets::load_emoji(&icu_testdata::unstable()).unwrap().as_borrowed().contains(c)
+}
+
+pub(super) fn definitely_emoji(c: char) -> bool {
+    sets::load_emoji_presentation(&icu_testdata::unstable())
+        .unwrap()
+        .as_borrowed()
+        .contains(c)
+        || is_modifier(c)
+}
+
+pub(super) fn is_modifier(c: char) -> bool {
+    sets::load_emoji_modifier(&icu_testdata::unstable())
+        .unwrap()
+        .as_borrowed()
+        .contains(c)
+}
+
+pub(super) fn is_component(c: char) -> bool {
+    sets::load_emoji_component(&icu_testdata::unstable())
+        .unwrap()
+        .as_borrowed()
+        .contains(c)
+}
+
+/*
+Loaded data is !Send+!Sync so we probably don't need to cache it.
+
+The "icu_testdata" crate includes the data we need, plus a lot of data we don't use. There is a complicated way to
+optimize this, but they are about to release embedded data, so we wait.
+
+see: https://github.com/unicode-org/icu4x/issues/3529
+
+ */
--- a/zero-ui-core/src/text/segmenting.rs
+++ b/zero-ui-core/src/text/segmenting.rs
@ -1,6 +1,6 @@
 use std::ops;

-use crate::{context::LayoutDirection, crate_util::FxHashMap};
+use crate::{context::LayoutDirection, crate_util::FxHashMap, text::emoji_util};

 use super::Txt;
 use unicode_bidi::BidiInfo;
@ -232,31 +232,43 @@ impl SegmentedText {
            base_direction,
        }
    }
+
    fn push_seg(text: &str, bidi: &BidiInfo, segs: &mut Vec<TextSegment>, end: usize) {
        let start = segs.last().map(|s| s.end).unwrap_or(0);

+        let mut char_indices = text[start..end].char_indices().peekable();
+
        let mut kind = TextSegmentKind::LeftToRight;
        let mut level = BidiLevel::ltr();
-        for (i, c) in text[start..end].char_indices() {
-            let c_kind = if unic_emoji_char::is_emoji(c) {
-                TextSegmentKind::Emoji
-            } else {
-                const ZWJ: char = '\u{200D}'; // ZERO WIDTH JOINER
-                const VS0: char = '\u{FE00}'; // VARIANT SELECTOR 0
-                const VS16: char = '\u{FE0F}'; // VARIANT SELECTOR 16
-                if matches!(kind, TextSegmentKind::Emoji) && (c == ZWJ || (VS0..=VS16).contains(&c)) {
-                    TextSegmentKind::Emoji
-                } else {
-                    match TextSegmentKind::from(bidi.original_classes[start + i]) {
-                        TextSegmentKind::OtherNeutral if super::unicode_bidi_util::bidi_bracket_data(c).is_some() => {
-                            TextSegmentKind::Bracket(c)
-                        }
-                        k => k,
-                    }
-                }
-            };
+        for (i, c) in &mut char_indices {
+            const ZWJ: char = '\u{200D}'; // ZERO WIDTH JOINER
+            const VS16: char = '\u{FE0F}'; // VARIANT SELECTOR 16 - Emoji
+            const CEK: char = '\u{20E3}'; // COMBINING ENCLOSING KEYCAP

-            let c_level = bidi.levels[start + i];
+            let is_emoji = (kind == TextSegmentKind::Emoji // maybe
+                && (
+                    c == VS16 // definitely, modifies prev. char into Emoji.
+                    || c == CEK // definitely, modified prev. char into keycap style.
+                    || c == ZWJ // definitely, ligature with the next Emoji or is ignored.
+                    || emoji_util::is_modifier(c) // definitely, has same effect as VS16.
+                    || emoji_util::is_component(c) // definitely, ligature data, like flag tags.
+                ))
+                || (emoji_util::maybe_emoji(c) // maybe
+                    && (emoji_util::definitely_emoji(c) // definitely
+                        // only if followed by VS16 or modifier
+                        || (text[start+i..].chars().nth(1).map(|c| c == VS16 || emoji_util::is_modifier(c)).unwrap_or(false))));
+
+            let (c_kind, c_level) = if is_emoji {
+                (TextSegmentKind::Emoji, level)
+            } else {
+                let k = match TextSegmentKind::from(bidi.original_classes[start + i]) {
+                    TextSegmentKind::OtherNeutral if super::unicode_bidi_util::bidi_bracket_data(c).is_some() => {
+                        TextSegmentKind::Bracket(c)
+                    }
+                    k => k,
+                };
+                (k, bidi.levels[start + i])
+            };

            if c_kind != kind || c_level != level || !c_kind.can_merge() {
                if i > 0 {
@ -279,7 +291,7 @@ impl SegmentedText {
    }

    /// The raw segment data.
-    pub fn segments(&self) -> &[TextSegment] {
+    pub fn segs(&self) -> &[TextSegment] {
        &self.segments
    }

@ -676,13 +688,30 @@ mod tests {

    #[test]
    fn emoji_seg() {
-        let test = "'🙎🏻‍♀️'";
+        let test = "'🙎🏻‍♀️'1# 1️⃣#️⃣";
        let txt = SegmentedText::new(test, LayoutDirection::LTR);
-        let k: Vec<_> = txt.segments().iter().map(|s| s.kind).collect();
+        let k: Vec<_> = txt.segs().iter().map(|s| s.kind).collect();

        assert_eq!(
-            vec![TextSegmentKind::OtherNeutral, TextSegmentKind::Emoji, TextSegmentKind::OtherNeutral],
+            vec![
+                TextSegmentKind::OtherNeutral,       // '
+                TextSegmentKind::Emoji,              // 🙎🏻‍♀️
+                TextSegmentKind::OtherNeutral,       // '
+                TextSegmentKind::EuropeanNumber,     // 1
+                TextSegmentKind::EuropeanTerminator, // #
+                TextSegmentKind::Space,
+                TextSegmentKind::Emoji, // 1️⃣#️⃣
+            ],
            k
        );
    }
+
+    #[test]
+    fn emoji_issues() {
+        let test = "🏴󠁧󠁢󠁥󠁮󠁧󠁿";
+        let txt = SegmentedText::new(test, LayoutDirection::LTR);
+        for (t, seg) in txt.iter() {
+            assert_eq!(seg.kind, TextSegmentKind::Emoji, "text: {t:?}");
+        }
+    }
 }
--- a/zero-ui-core/src/text/shaping.rs
+++ b/zero-ui-core/src/text/shaping.rs
@ -491,8 +491,12 @@ impl ShapedText {
            let block_size = self.block_size();
            let align_size = constraints.fill_size_or(block_size);

-            let mut first = PxRect::from_size(self.first_line().map(|l| l.rect().size).unwrap_or_default());
-            let mut last = PxRect::from_size(self.last_line().map(|l| l.rect().size).unwrap_or_default());
+            let mut first = PxRect::from_size(self.line(0).map(|l| l.rect().size).unwrap_or_default());
+            let mut last = PxRect::from_size(
+                self.line(self.lines_len().saturating_sub(1))
+                    .map(|l| l.rect().size)
+                    .unwrap_or_default(),
+            );
            last.origin.y = block_size.height - last.size.height;

            first.origin.x = (align_size.width - first.size.width) * align_x;
@ -774,23 +778,15 @@ impl ShapedText {
        self.first_wrapped
    }

-    /// Gets the first line, if the text contains any line.
-    pub fn first_line(&self) -> Option<ShapedLine> {
-        self.lines().next()
-    }
-
-    /// Gets the last line, if the text contains any line.
-    ///
-    /// This is more efficient than `t.lines().last()`.
-    pub fn last_line(&self) -> Option<ShapedLine> {
-        if self.lines.0.is_empty() {
+    /// Gets the line by index, returns `None` if `line_idx` is out of range.
+    pub fn line(&self, line_idx: usize) -> Option<ShapedLine> {
+        if self.lines.0.len() <= line_idx {
            None
        } else {
-            let last_line = self.lines.0.len() - 1;
-            self.lines.iter_segs_skip(last_line).next().map(move |(w, r)| ShapedLine {
+            self.lines.iter_segs_skip(line_idx).next().map(move |(w, r)| ShapedLine {
                text: self,
                seg_range: r,
-                index: last_line,
+                index: line_idx,
                width: Px(w.round() as i32),
            })
        }
@ -919,7 +915,7 @@ impl ShapedText {
            }
        }

-        if let Some(line) = self.last_line() {
+        if let Some(line) = self.line(self.lines_len().saturating_sub(1)) {
            // top-right of last line
            let rect = line.rect();
            PxPoint::new(rect.max_x(), rect.min_y())
@ -932,9 +928,9 @@ impl ShapedText {
    pub fn nearest_line(&self, y: Px) -> Option<ShapedLine> {
        let first_line_max_y = self.first_line.max_y();
        if first_line_max_y >= y {
-            self.first_line()
+            self.line(0)
        } else if self.last_line.min_y() <= y {
-            self.last_line()
+            self.line(self.lines_len().saturating_sub(1))
        } else {
            let y = y - first_line_max_y;
            let line = (y / self.line_height()).0 as usize + 1;
@ -1705,6 +1701,21 @@ impl<'a> ShapedLine<'a> {
        self.seg_range.len()
    }

+    /// Gets the segment by index.
+    ///
+    /// The first segment of the line is `0`, returns `None` if `seg_idx` is out of range.
+    pub fn seg(&self, seg_idx: usize) -> Option<ShapedSegment> {
+        if self.seg_range.len() > seg_idx {
+            Some(ShapedSegment {
+                text: self.text,
+                line_index: self.index,
+                index: seg_idx + self.seg_range.start(),
+            })
+        } else {
+            None
+        }
+    }
+
    /// Returns `true` if this line was started by the wrap algorithm.
    ///
    /// If this is `false` then the line is the first or the previous line ends in a [`LineBreak`].
--- a/zero-ui/src/widgets/text/nodes.rs
+++ b/zero-ui/src/widgets/text/nodes.rs
@ -752,16 +752,19 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
                    }
                    InlineConstraints::Layout(l) => {
                        if !self.pending.contains(PendingLayout::RESHAPE)
-                            && (Some(l.first_segs.len()) != r.shaped_text.first_line().map(|l| l.segs_len())
-                                || Some(l.last_segs.len()) != r.shaped_text.last_line().map(|l| l.segs_len()))
+                            && (Some(l.first_segs.len()) != r.shaped_text.line(0).map(|l| l.segs_len())
+                                || Some(l.last_segs.len())
+                                    != r.shaped_text
+                                        .line(r.shaped_text.lines_len().saturating_sub(1))
+                                        .map(|l| l.segs_len()))
                        {
                            self.pending.insert(PendingLayout::RESHAPE);
                        }

                        if !self.pending.contains(PendingLayout::RESHAPE_LINES)
                            && (r.shaped_text.mid_clear() != l.mid_clear
-                                || r.shaped_text.first_line().map(|l| l.rect()) != Some(l.first)
-                                || r.shaped_text.last_line().map(|l| l.rect()) != Some(l.last))
+                                || r.shaped_text.line(0).map(|l| l.rect()) != Some(l.first)
+                                || r.shaped_text.line(r.shaped_text.lines_len().saturating_sub(1)).map(|l| l.rect()) != Some(l.last))
                        {
                            self.pending.insert(PendingLayout::RESHAPE_LINES);
                        }
@ -1181,7 +1184,7 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
                let size = txt.layout(&metrics, &RESOLVED_TEXT.get(), true);

                if let (Some(inline), Some(l)) = (wm.inline(), txt.txt.as_ref()) {
-                    if let Some(first_line) = l.shaped_text.first_line() {
+                    if let Some(first_line) = l.shaped_text.line(0) {
                        inline.first = first_line.original_size();
                        inline.with_first_segs(|i| {
                            for seg in first_line.segs() {
@ -1199,7 +1202,7 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
                    if l.shaped_text.lines_len() == 1 {
                        inline.last = inline.first;
                        inline.last_segs = inline.first_segs.clone();
-                    } else if let Some(last_line) = l.shaped_text.last_line() {
+                    } else if let Some(last_line) = l.shaped_text.line(l.shaped_text.lines_len().saturating_sub(1)) {
                        inline.last = last_line.original_size();
                        inline.with_last_segs(|i| {
                            for seg in last_line.segs() {
@ -1240,7 +1243,7 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {

                for (i, line) in l.shaped_text.lines().enumerate() {
                    if i == 0 {
-                        let info = l.shaped_text.first_line().unwrap().segs().map(|s| s.inline_info());
+                        let info = l.shaped_text.line(0).unwrap().segs().map(|s| s.inline_info());
                        if LAYOUT.direction().is_rtl() {
                            // help sort
                            inline.set_first_segs(info.rev());
@ -1248,7 +1251,12 @@ pub fn layout_text(child: impl UiNode) -> impl UiNode {
                            inline.set_first_segs(info);
                        }
                    } else if i == last_line {
-                        let info = l.shaped_text.last_line().unwrap().segs().map(|s| s.inline_info());
+                        let info = l
+                            .shaped_text
+                            .line(l.shaped_text.lines_len().saturating_sub(1))
+                            .unwrap()
+                            .segs()
+                            .map(|s| s.inline_info());
                        if LAYOUT.direction().is_rtl() {
                            // help sort
                            inline.set_last_segs(info.rev());
--- a/zero-ui/src/widgets/text/text_properties.rs
+++ b/zero-ui/src/widgets/text/text_properties.rs
@ -928,7 +928,7 @@ impl CaretStatus {
        } else {
            let mut line = 1;
            let mut line_start = 0;
-            for seg in text.segments() {
+            for seg in text.segs() {
                if seg.end > index {
                    break;
                }
@ -992,8 +992,15 @@ pub enum LinesWrapCount {
    /// The associated value is a vec of wrap-line count for each text line, is `1` for lines that don't wrap.
    Wrap(Vec<u32>),
 }
-
-impl LinesWrapCount {}
+impl LinesWrapCount {
+    /// Gets the number of text lines.
+    pub fn lines_len(&self) -> usize {
+        match self {
+            Self::NoWrap(l) => *l,
+            Self::Wrap(lns) => lns.len(),
+        }
+    }
+}

 /// Text paragraph properties.
 ///