From 796cb60888f05dfc1d70729327eda038c980c455 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 30 Jan 2025 11:30:41 +0800 Subject: [PATCH 01/10] build: introduce unicode-segmentation Signed-off-by: tison --- Cargo.toml | 7 +++++-- benches/build_large_table.rs | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7c2afae..7006b29 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,15 +53,18 @@ debug = [] integration_test = [] [dependencies] +unicode-segmentation = { version = "1" } +unicode-width = { version = "0.2" } + +# Optional dependencies ansi-str = { version = "0.8", optional = true } console = { version = "0.15", optional = true } -unicode-width = "0.2" [dev-dependencies] criterion = "0.5" pretty_assertions = "1" proptest = "1" -rand = "0.8" +rand = "0.9" rstest = "0.24" # We don't need any of the default features for crossterm. diff --git a/benches/build_large_table.rs b/benches/build_large_table.rs index 10ff442..73949ed 100644 --- a/benches/build_large_table.rs +++ b/benches/build_large_table.rs @@ -2,11 +2,11 @@ use criterion::{criterion_group, criterion_main, Criterion}; use comfy_table::presets::UTF8_FULL; use comfy_table::*; -use rand::distributions::Alphanumeric; +use rand::distr::Alphanumeric; use rand::Rng; /// Create a dynamic 10x500 Table with width 300 and unevenly distributed content. -/// There're no constriant, the content simply has to be formatted to fit as good as possible into +/// There are no constraint, the content simply has to be formatted to fit as good as possible into /// the given space. fn build_huge_table() { let mut table = Table::new(); @@ -16,12 +16,12 @@ fn build_huge_table() { .set_width(300) .set_header(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); // Create a 10x10 grid for _ in 0..500 { let mut row = Vec::new(); for _ in 0..10 { - let string_length = rng.gen_range(2..100); + let string_length = rng.random_range(2..100); let random_string: String = (&mut rng) .sample_iter(&Alphanumeric) .take(string_length) From f0e3d96b8b583fca869074845f541669b6e7829f Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 30 Jan 2025 11:33:03 +0800 Subject: [PATCH 02/10] add test case Signed-off-by: tison --- src/utils/formatting/content_split/normal.rs | 27 ++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/utils/formatting/content_split/normal.rs b/src/utils/formatting/content_split/normal.rs index ea510c9..fb1a92e 100644 --- a/src/utils/formatting/content_split/normal.rs +++ b/src/utils/formatting/content_split/normal.rs @@ -48,3 +48,30 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { let remaining = char_iter.collect(); (parts, remaining) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_split_long_word() { + let emoji = "🙂‍↕️"; // U+1F642 U+200D U+2195 U+FE0F head shaking vertically + assert_eq!(emoji.len(), 13); + assert_eq!(emoji.chars().count(), 4); + assert_eq!(emoji.width(), 2); + + let (word, remaining) = split_long_word(emoji.width(), &emoji); + + println!("{word}"); + assert_eq!(word, "\u{1F642}\u{200D}"); + assert_eq!(word.len(), 7); + assert_eq!(word.chars().count(), 2); + assert_eq!(word.width(), 2); + + println!("{remaining}"); + assert_eq!(remaining, "\u{2195}\u{FE0F}"); + assert_eq!(remaining.len(), 6); + assert_eq!(remaining.chars().count(), 2); + assert_eq!(remaining.width(), 2); + } +} From 1b0eb26ad4178c6041630bad249134429d5723c2 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 30 Jan 2025 11:36:35 +0800 Subject: [PATCH 03/10] fix it Signed-off-by: tison --- src/utils/formatting/content_split/normal.rs | 26 +++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/utils/formatting/content_split/normal.rs b/src/utils/formatting/content_split/normal.rs index fb1a92e..800f86d 100644 --- a/src/utils/formatting/content_split/normal.rs +++ b/src/utils/formatting/content_split/normal.rs @@ -1,4 +1,5 @@ -use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; +use unicode_segmentation::UnicodeSegmentation; +use unicode_width::UnicodeWidthStr; /// returns printed length of string /// if ansi feature enabled, takes into account escape codes @@ -22,12 +23,12 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { let mut current_width = 0; let mut parts = String::new(); - let mut char_iter = word.chars().peekable(); + let mut char_iter = word.graphemes(true).peekable(); // Check if the string might be too long, one character at a time. // Peek into the next char and check the exit condition. // That is, pushing the next character would result in the string being too long. while let Some(c) = char_iter.peek() { - if (current_width + c.width().unwrap_or(1)) > allowed_width { + if (current_width + c.width()) > allowed_width { break; } @@ -36,12 +37,12 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { // We default to 1 char, if the character length cannot be determined. // The user has to live with this, if they decide to add control characters or some fancy - // stuff into their tables. This is considered undefined behavior and we try to handle this + // stuff into their tables. This is considered undefined behavior, and we try to handle this // to the best of our capabilities. - let character_width = c.width().unwrap_or(1); + let character_width = c.width(); current_width += character_width; - parts.push(c); + parts.push_str(c); } // Collect the remaining characters. @@ -62,16 +63,11 @@ mod tests { let (word, remaining) = split_long_word(emoji.width(), &emoji); - println!("{word}"); - assert_eq!(word, "\u{1F642}\u{200D}"); - assert_eq!(word.len(), 7); - assert_eq!(word.chars().count(), 2); + assert_eq!(word, "\u{1F642}\u{200D}\u{2195}\u{FE0F}"); + assert_eq!(word.len(), 13); + assert_eq!(word.chars().count(), 4); assert_eq!(word.width(), 2); - println!("{remaining}"); - assert_eq!(remaining, "\u{2195}\u{FE0F}"); - assert_eq!(remaining.len(), 6); - assert_eq!(remaining.chars().count(), 2); - assert_eq!(remaining.width(), 2); + assert!(remaining.is_empty()); } } From 02fbcd47d8e14dc4845e291cc96857d4e0e0c9e1 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 30 Jan 2025 11:43:25 +0800 Subject: [PATCH 04/10] fmt Signed-off-by: tison --- src/utils/formatting/content_split/normal.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/formatting/content_split/normal.rs b/src/utils/formatting/content_split/normal.rs index 800f86d..02cc0e2 100644 --- a/src/utils/formatting/content_split/normal.rs +++ b/src/utils/formatting/content_split/normal.rs @@ -61,7 +61,7 @@ mod tests { assert_eq!(emoji.chars().count(), 4); assert_eq!(emoji.width(), 2); - let (word, remaining) = split_long_word(emoji.width(), &emoji); + let (word, remaining) = split_long_word(emoji.width(), emoji); assert_eq!(word, "\u{1F642}\u{200D}\u{2195}\u{FE0F}"); assert_eq!(word.len(), 13); From 1ed8bbe876b8e4d31e9a45753e08911acbb5688b Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 30 Jan 2025 11:47:29 +0800 Subject: [PATCH 05/10] for custom_styling Signed-off-by: tison --- .../content_split/custom_styling.rs | 7 +++--- src/utils/formatting/content_split/mod.rs | 23 +++++++++++++++++++ src/utils/formatting/content_split/normal.rs | 22 ------------------ 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/utils/formatting/content_split/custom_styling.rs b/src/utils/formatting/content_split/custom_styling.rs index c9c01a7..1e3f8ee 100644 --- a/src/utils/formatting/content_split/custom_styling.rs +++ b/src/utils/formatting/content_split/custom_styling.rs @@ -1,4 +1,5 @@ use ansi_str::AnsiStr; +use unicode_segmentation::UnicodeSegmentation; use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; const ANSI_RESET: &str = "\u{1b}[0m"; @@ -84,16 +85,16 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { } } else { assert!(!is_esc); - let mut char_iter = str_slice.chars().peekable(); + let mut char_iter = str_slice.graphemes(true).peekable(); while let Some(c) = char_iter.peek() { - let character_width = c.width().unwrap_or(0); + let character_width = c.width(); if allowed_width < head_len + character_width { break; } head_len += character_width; let c = char_iter.next().unwrap(); - head.push(c); + head.push_str(c); // c is not escape code head_len_last = head.len(); diff --git a/src/utils/formatting/content_split/mod.rs b/src/utils/formatting/content_split/mod.rs index aa70042..6f436d3 100644 --- a/src/utils/formatting/content_split/mod.rs +++ b/src/utils/formatting/content_split/mod.rs @@ -158,3 +158,26 @@ fn check_if_full(lines: &mut Vec, content_width: usize, current_line: St current_line } + +#[cfg(test)] +mod tests { + use super::*; + use unicode_width::UnicodeWidthStr; + + #[test] + fn test_split_long_word() { + let emoji = "🙂‍↕️"; // U+1F642 U+200D U+2195 U+FE0F head shaking vertically + assert_eq!(emoji.len(), 13); + assert_eq!(emoji.chars().count(), 4); + assert_eq!(emoji.width(), 2); + + let (word, remaining) = split_long_word(emoji.width(), emoji); + + assert_eq!(word, "\u{1F642}\u{200D}\u{2195}\u{FE0F}"); + assert_eq!(word.len(), 13); + assert_eq!(word.chars().count(), 4); + assert_eq!(word.width(), 2); + + assert!(remaining.is_empty()); + } +} diff --git a/src/utils/formatting/content_split/normal.rs b/src/utils/formatting/content_split/normal.rs index 02cc0e2..f9dac36 100644 --- a/src/utils/formatting/content_split/normal.rs +++ b/src/utils/formatting/content_split/normal.rs @@ -49,25 +49,3 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { let remaining = char_iter.collect(); (parts, remaining) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_split_long_word() { - let emoji = "🙂‍↕️"; // U+1F642 U+200D U+2195 U+FE0F head shaking vertically - assert_eq!(emoji.len(), 13); - assert_eq!(emoji.chars().count(), 4); - assert_eq!(emoji.width(), 2); - - let (word, remaining) = split_long_word(emoji.width(), emoji); - - assert_eq!(word, "\u{1F642}\u{200D}\u{2195}\u{FE0F}"); - assert_eq!(word.len(), 13); - assert_eq!(word.chars().count(), 4); - assert_eq!(word.width(), 2); - - assert!(remaining.is_empty()); - } -} From bd072911ee0b031e8a6b9b06b0ce7a5baef11ee9 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 30 Jan 2025 11:51:05 +0800 Subject: [PATCH 06/10] fix up comments Signed-off-by: tison --- .../content_split/custom_styling.rs | 8 +++---- src/utils/formatting/content_split/mod.rs | 14 ++++++------- src/utils/formatting/content_split/normal.rs | 21 +++++++------------ 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/src/utils/formatting/content_split/custom_styling.rs b/src/utils/formatting/content_split/custom_styling.rs index 1e3f8ee..615c483 100644 --- a/src/utils/formatting/content_split/custom_styling.rs +++ b/src/utils/formatting/content_split/custom_styling.rs @@ -85,15 +85,15 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { } } else { assert!(!is_esc); - let mut char_iter = str_slice.graphemes(true).peekable(); - while let Some(c) = char_iter.peek() { + let mut graphmes = str_slice.graphemes(true).peekable(); + while let Some(c) = graphmes.peek() { let character_width = c.width(); if allowed_width < head_len + character_width { break; } head_len += character_width; - let c = char_iter.next().unwrap(); + let c = graphmes.next().unwrap(); head.push_str(c); // c is not escape code @@ -110,7 +110,7 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { for esc in escapes { tail.push_str(esc); } - let remaining: String = char_iter.collect(); + let remaining: String = graphmes.collect(); tail.push_str(&remaining); break; } diff --git a/src/utils/formatting/content_split/mod.rs b/src/utils/formatting/content_split/mod.rs index 6f436d3..13e4b63 100644 --- a/src/utils/formatting/content_split/mod.rs +++ b/src/utils/formatting/content_split/mod.rs @@ -17,15 +17,15 @@ pub use normal::*; /// These "elements" then get added one-by-one to the lines, until a line is full. /// As soon as the line is full, we add it to the result set and start a new line. /// -/// This is repeated until there're no more "elements". +/// This is repeated until there are no more "elements". /// -/// Mid-element splits only occurs if a element doesn't fit in a single line by itself. +/// Mid-element splits only occurs if an element doesn't fit in a single line by itself. pub fn split_line(line: &str, info: &ColumnDisplayInfo, delimiter: char) -> Vec { let mut lines = Vec::new(); let content_width = usize::from(info.content_width); // Split the line by the given deliminator and turn the content into a stack. - // Also clone it and convert it into a Vec. Otherwise we get some burrowing problems + // Also clone it and convert it into a Vec. Otherwise, we get some burrowing problems // due to early drops of borrowed values that need to be inserted into `Vec<&str>` let mut elements = split_line_by_delimiter(line, delimiter); @@ -81,7 +81,7 @@ pub fn split_line(line: &str, info: &ColumnDisplayInfo, delimiter: char) -> Vec< // There are two scenarios: // // 1. The word is too long for a single line. - // In this case, we have to split the element anyways. Let's fill the remaining space on + // In this case, we have to split the element anyway. Let's fill the remaining space on // the current line with, start a new line and push the remaining part on the stack. // 2. The word is short enough to fit as a whole into a line // In that case we simply push the current line and start a new one with the current element @@ -99,9 +99,9 @@ pub fn split_line(line: &str, info: &ColumnDisplayInfo, delimiter: char) -> Vec< let (mut next, mut remaining) = split_long_word(remaining_width, &next); - // This is a ugly hack, but it's needed for now. + // This is an ugly hack, but it's needed for now. // - // Scenario: The current column has to have a width of 1 and we work with a new line. + // Scenario: The current column has to have a width of 1, and we work with a new line. // However, the next char is a multi-character UTF-8 symbol. // // Since a multi-character wide symbol doesn't fit into a 1-character column, @@ -141,7 +141,7 @@ pub fn split_line(line: &str, info: &ColumnDisplayInfo, delimiter: char) -> Vec< /// This is the minimum of available characters per line. /// It's used to check, whether another element can be added to the current line. -/// Otherwise the line will simply be left as it is and we start with a new one. +/// Otherwise, the line will simply be left as it is, and we start with a new one. /// Two chars seems like a reasonable approach, since this would require next element to be /// a single char + delimiter. const MIN_FREE_CHARS: usize = 2; diff --git a/src/utils/formatting/content_split/normal.rs b/src/utils/formatting/content_split/normal.rs index f9dac36..7bbb495 100644 --- a/src/utils/formatting/content_split/normal.rs +++ b/src/utils/formatting/content_split/normal.rs @@ -23,29 +23,24 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { let mut current_width = 0; let mut parts = String::new(); - let mut char_iter = word.graphemes(true).peekable(); - // Check if the string might be too long, one character at a time. - // Peek into the next char and check the exit condition. - // That is, pushing the next character would result in the string being too long. - while let Some(c) = char_iter.peek() { + let mut graphmes = word.graphemes(true).peekable(); + // Check if the string might be too long, one Unicode grapheme at a time. + // Peek into the next grapheme and check the exit condition. + // That is, pushing the next grapheme would result in the string being too long. + while let Some(c) = graphmes.peek() { if (current_width + c.width()) > allowed_width { break; } - // We can unwrap, as we just checked that a suitable character is next in line. - let c = char_iter.next().unwrap(); + // We can unwrap, as we just checked that a suitable grapheme is next in line. + let c = graphmes.next().unwrap(); - // We default to 1 char, if the character length cannot be determined. - // The user has to live with this, if they decide to add control characters or some fancy - // stuff into their tables. This is considered undefined behavior, and we try to handle this - // to the best of our capabilities. let character_width = c.width(); - current_width += character_width; parts.push_str(c); } // Collect the remaining characters. - let remaining = char_iter.collect(); + let remaining = graphmes.collect(); (parts, remaining) } From 491457f603f6fd22318548d08b7c7dfac2c59410 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 30 Jan 2025 11:56:43 +0800 Subject: [PATCH 07/10] fixup CI Signed-off-by: tison --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 959e7b4..e6a40f6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -36,6 +36,11 @@ jobs: - target: wasm32-wasi os: ubuntu-latest minimal_setup: true + toolchain: "1.74" + - target: wasm32-wasip1 + os: ubuntu-latest + minimal_setup: true + toolchain: "stable" - target: x86_64-pc-windows-msvc os: windows-latest minimal_setup: false From 69a74151b2f17fed09d38b5e5ff1de89c357ba46 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 30 Jan 2025 12:00:23 +0800 Subject: [PATCH 08/10] try fix GHA matric Signed-off-by: tison --- .github/workflows/test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e6a40f6..8cee0d0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,6 @@ jobs: - x86_64-unknown-linux-gnu - x86_64-pc-windows-msvc - x86_64-apple-darwin - - wasm32-wasi toolchain: [stable, "1.74"] include: - target: x86_64-unknown-linux-gnu From 910ebfc925465c3f770cacb5cf7dc318d86f0e18 Mon Sep 17 00:00:00 2001 From: tison Date: Sun, 2 Feb 2025 19:31:29 +0800 Subject: [PATCH 09/10] address comments Signed-off-by: tison --- src/utils/formatting/content_split/normal.rs | 9 +++++- tests/all/utf_8_characters.rs | 29 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/utils/formatting/content_split/normal.rs b/src/utils/formatting/content_split/normal.rs index 7bbb495..036efa5 100644 --- a/src/utils/formatting/content_split/normal.rs +++ b/src/utils/formatting/content_split/normal.rs @@ -24,9 +24,16 @@ pub fn split_long_word(allowed_width: usize, word: &str) -> (String, String) { let mut parts = String::new(); let mut graphmes = word.graphemes(true).peekable(); + // Check if the string might be too long, one Unicode grapheme at a time. // Peek into the next grapheme and check the exit condition. - // That is, pushing the next grapheme would result in the string being too long. + // + // This code uses graphemes to handle both zero-width joiner[0] UTF-8 chars, which + // combine multiple UTF-8 chars into a single grapheme, and variant selectors [1], + // which pick a certain variant of the preceding char. + // + // [0]: https://en.wikipedia.org/wiki/Zero-width_joiner + // [1]: https://en.wikipedia.org/wiki/Variation_Selectors_(Unicode_block) while let Some(c) = graphmes.peek() { if (current_width + c.width()) > allowed_width { break; diff --git a/tests/all/utf_8_characters.rs b/tests/all/utf_8_characters.rs index 938b39c..7a3db19 100644 --- a/tests/all/utf_8_characters.rs +++ b/tests/all/utf_8_characters.rs @@ -56,3 +56,32 @@ fn multi_character_utf8_word_splitting() { println!("{expected}"); assert_eq!(expected, "\n".to_string() + &table.to_string()); } + +/// Handle emojis that'd joined via the "zero-width joiner" character U+200D and contain variant +/// selectors. +/// +/// Those composite emojis should be handled as a single grapheme and thereby have their width +/// calculated based on the grapheme length instead of the individual chars. +/// +/// This is also a regression test, as previously emojis were split in the middle of the joiner +/// sequence, resulting in two different emojis on different lines. +#[test] +fn zwj_utf8_word_splitting() { + let mut table = Table::new(); + table + .set_width(8) + .set_content_arrangement(ContentArrangement::Dynamic) + .set_header(vec!["test"]) + .add_row(vec!["ab🙂‍↕️def"]); + + println!("{table}"); + let expected = " ++------+ +| test | ++======+ +| ab🙂‍↕️ | +| def | ++------+"; + println!("{expected}"); + assert_eq!(expected, "\n".to_string() + &table.to_string()); +} From b7700aea7240879bd130731fc01c6754f9c8b729 Mon Sep 17 00:00:00 2001 From: tison Date: Mon, 3 Feb 2025 17:07:32 +0800 Subject: [PATCH 10/10] add multi_character_cjk_word_splitting case Signed-off-by: tison --- tests/all/utf_8_characters.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/all/utf_8_characters.rs b/tests/all/utf_8_characters.rs index 7a3db19..92932e0 100644 --- a/tests/all/utf_8_characters.rs +++ b/tests/all/utf_8_characters.rs @@ -57,6 +57,29 @@ fn multi_character_utf8_word_splitting() { assert_eq!(expected, "\n".to_string() + &table.to_string()); } +#[test] +fn multi_character_cjk_word_splitting() { + let mut table = Table::new(); + table + .set_width(8) + .set_content_arrangement(ContentArrangement::Dynamic) + .set_header(vec!["test"]) + .add_row(vec!["abc新年快乐edf"]); + + println!("{table}"); + let expected = " ++------+ +| test | ++======+ +| abc | +| 新年 | +| 快乐 | +| edf | ++------+"; + println!("{expected}"); + assert_eq!(expected, "\n".to_string() + &table.to_string()); +} + /// Handle emojis that'd joined via the "zero-width joiner" character U+200D and contain variant /// selectors. ///