diff --git a/benches/read_parquet.rs b/benches/read_parquet.rs index f455c66434b..74581faec49 100644 --- a/benches/read_parquet.rs +++ b/benches/read_parquet.rs @@ -42,6 +42,11 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| read_decompressed_pages(&buffer, size * 8, 2).unwrap()) }); + let a = format!("read utf8 large emoji 2^{}", i); + c.bench_function(&a, |b| { + b.iter(|| read_decompressed_pages(&buffer, size * 8, 6).unwrap()) + }); + let a = format!("read bool 2^{}", i); c.bench_function(&a, |b| { b.iter(|| read_decompressed_pages(&buffer, size * 8, 3).unwrap()) diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index 1cda4465af2..891dc0f683a 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -10,6 +10,7 @@ def case_basic_nullable(size=1): float64 = [0.0, 1.0, None, 3.0, None, 5.0, 6.0, 7.0, None, 9.0] string = ["Hello", None, "aa", "", None, "abc", None, None, "def", "aaa"] boolean = [True, None, False, False, None, True, None, None, True, True] + string_large_emoji = ["😃🌚🕳👊🚅🚑🎐🚘✨⛎⛹📔🔫😭🀄️🏗🚵🍒⏮🏎🎼🌥🌀🕎⛴💀™️📈🍋🅿️🌉✅⚜🏓🏜💅📖🚾🛤☺️☑️🕊🌁📡💵📮🌷💡🍩🏬💫🏩🍵🎼◽️❌♥️🛌🕹🍰🗄💷▪️🔲🏛👡👽🍭🛤▶️🍫😵🏔🎁🌫☎️✈️〰️👚🐫🍺🎢🔵👊🗒🆘🎡💌♋️➕♉️🖐🎶📒™️👛😆👠🐛🌫🦄⚫️😕🍙🕠♨️➿🔰💺🕳😶👳😙🌧🍽🏘🐰🍗🍲🐏🌂🌆🗂🚀👓↘️📀🔰"] * 10 fields = [ pa.field("int64", pa.int64()), @@ -18,6 +19,7 @@ def case_basic_nullable(size=1): pa.field("bool", pa.bool_()), pa.field("date", pa.timestamp("ms")), pa.field("uint32", pa.uint32()), + pa.field("string_large_emoji", pa.utf8()), ] schema = pa.schema(fields) @@ -29,6 +31,7 @@ def case_basic_nullable(size=1): "bool": boolean * size, "date": int64 * size, "uint32": int64 * size, + "string_large_emoji": string_large_emoji * size, }, schema, f"basic_nullable_{size*10}.parquet",