Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AK: Replace Unicode validation, conversion, and length computation with simdutf #674

Merged
merged 5 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions .github/workflows/lagom-template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,21 +104,20 @@ jobs:
run: |
set -e

cmake -GNinja -S Meta/Lagom -B ${{ github.workspace }}/tools-build \
cmake --preset=CI -S Meta/Lagom -B ${{ github.workspace }}/Build/tools-build \
-DLAGOM_TOOLS_ONLY=ON \
-DINSTALL_LAGOM_TOOLS=ON \
-DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/tool-install \
-DSERENITY_CACHE_DIR=${{ github.workspace }}/Build/caches \
-DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/Build/tools-install \
-DCMAKE_C_COMPILER=gcc-13 \
-DCMAKE_CXX_COMPILER=g++-13 \
-Dpackage=LagomTools

ninja -C tools-build install
ninja -C ${{ github.workspace }}/Build/tools-build install

cmake --preset Fuzzers_CI -B Build \
-DCMAKE_C_COMPILER=${{ steps.build-parameters.outputs.host_cc }} \
-DCMAKE_CXX_COMPILER=${{ steps.build-parameters.outputs.host_cxx }} \
-DCMAKE_PREFIX_PATH=${{ github.workspace }}/tool-install
-DCMAKE_PREFIX_PATH=${{ github.workspace }}/Build/tools-install

# === BUILD ===

Expand Down
11 changes: 3 additions & 8 deletions AK/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(SOURCES
Assertions.cpp
Base64.cpp
CircularBuffer.cpp
ConstrainedStream.cpp
CountingStream.cpp
Expand Down Expand Up @@ -37,10 +38,6 @@ set(SOURCES
kmalloc.cpp
)

if (NOT LAGOM_TOOLS_ONLY)
list(APPEND SOURCES Base64.cpp)
endif()

serenity_lib(AK ak)

serenity_install_headers(AK)
Expand All @@ -60,7 +57,5 @@ else()
message(WARNING "Backtrace not found, stack traces will be unavailable")
endif()

if (NOT LAGOM_TOOLS_ONLY)
find_package(simdutf REQUIRED)
target_link_libraries(AK PRIVATE simdutf::simdutf)
endif()
find_package(simdutf REQUIRED)
target_link_libraries(AK PRIVATE simdutf::simdutf)
29 changes: 29 additions & 0 deletions AK/String.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,21 @@
* SPDX-License-Identifier: BSD-2-Clause
*/

#define AK_DONT_REPLACE_STD

#include <AK/Array.h>
#include <AK/Checked.h>
#include <AK/FlyString.h>
#include <AK/Format.h>
#include <AK/MemMem.h>
#include <AK/Stream.h>
#include <AK/String.h>
#include <AK/Utf16View.h>
#include <AK/Vector.h>
#include <stdlib.h>

#include <simdutf.h>

namespace AK {

String String::from_utf8_without_validation(ReadonlyBytes bytes)
Expand All @@ -39,6 +44,30 @@ ErrorOr<String> String::from_utf8(StringView view)
return result;
}

// Builds a UTF-8 String from the given UTF-16 view.
// Returns an error if the view does not hold valid UTF-16.
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
{
    // Reject malformed input before computing the output length and converting.
    if (!utf16.validate())
        return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");

    auto const* code_units = reinterpret_cast<char16_t const*>(utf16.data());
    auto const code_unit_count = utf16.length_in_code_units();

    // Size the destination up front so the conversion can write straight into the new string's storage.
    auto const utf8_length = simdutf::utf8_length_from_utf16(code_units, code_unit_count);

    String string;

    TRY(string.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
        [[maybe_unused]] auto bytes_written = simdutf::convert_utf16_to_utf8(
            code_units,
            code_unit_count,
            reinterpret_cast<char*>(buffer.data()));

        // The converter is expected to fill exactly the buffer sized by utf8_length_from_utf16().
        ASSERT(bytes_written == buffer.size());

        return {};
    }));

    return string;
}

ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
{
String result;
Expand Down
4 changes: 4 additions & 0 deletions AK/String.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,16 @@ class String : public Detail::StringBase {

// Creates a new String from a sequence of UTF-8 encoded code points.
static ErrorOr<String> from_utf8(StringView);

template<typename T>
requires(IsOneOf<RemoveCVReference<T>, ByteString, DeprecatedFlyString, FlyString, String>)
static ErrorOr<String> from_utf8(T&&) = delete;

[[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes);

// Creates a new String from a sequence of UTF-16 encoded code points.
// Returns an error if the input is not valid UTF-16.
static ErrorOr<String> from_utf16(Utf16View const&);

// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
static ErrorOr<String> from_stream(Stream&, size_t byte_count);

Expand Down
103 changes: 66 additions & 37 deletions AK/Utf16View.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
/*
* Copyright (c) 2021-2023, Tim Flynn <[email protected]>
* Copyright (c) 2021-2024, Tim Flynn <[email protected]>
*
* SPDX-License-Identifier: BSD-2-Clause
*/

#define AK_DONT_REPLACE_STD

#include <AK/CharacterTypes.h>
#include <AK/Concepts.h>
#include <AK/StringBuilder.h>
Expand All @@ -12,6 +14,8 @@
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>

#include <simdutf.h>

namespace AK {

static constexpr u16 high_surrogate_min = 0xd800;
Expand All @@ -22,7 +26,7 @@ static constexpr u32 replacement_code_point = 0xfffd;
static constexpr u32 first_supplementary_plane_code_point = 0x10000;

template<OneOf<Utf8View, Utf32View> UtfViewType>
static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view)
static ErrorOr<Utf16Data> to_utf16_slow(UtfViewType const& view)
{
Utf16Data utf16_data;
TRY(utf16_data.try_ensure_capacity(view.length()));
Expand All @@ -35,17 +39,45 @@ static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view)

ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view)
{
return to_utf16_impl(Utf8View { utf8_view });
return utf8_to_utf16(Utf8View { utf8_view });
}

ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view)
{
return to_utf16_impl(utf8_view);
// All callers want to allow lonely surrogates, which simdutf does not permit.
if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
return to_utf16_slow(utf8_view);

Utf16Data utf16_data;

TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(
reinterpret_cast<char const*>(utf8_view.bytes()),
utf8_view.byte_length())));

[[maybe_unused]] auto result = simdutf::convert_utf8_to_utf16(
trflynn89 marked this conversation as resolved.
Show resolved Hide resolved
reinterpret_cast<char const*>(utf8_view.bytes()),
utf8_view.byte_length(),
reinterpret_cast<char16_t*>(utf16_data.data()));
ASSERT(result == utf16_data.size());

return utf16_data;
}

ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view)
{
return to_utf16_impl(utf32_view);
Utf16Data utf16_data;

TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32(
reinterpret_cast<char32_t const*>(utf32_view.code_points()),
utf32_view.length())));

[[maybe_unused]] auto result = simdutf::convert_utf32_to_utf16(
reinterpret_cast<char32_t const*>(utf32_view.code_points()),
utf32_view.length(),
reinterpret_cast<char16_t*>(utf16_data.data()));
ASSERT(result == utf16_data.size());

return utf16_data;
}

ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point)
Expand Down Expand Up @@ -88,30 +120,27 @@ ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invali

ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
{
if (allow_invalid_code_units == AllowInvalidCodeUnits::No)
return String::from_utf16(*this);

StringBuilder builder;

if (allow_invalid_code_units == AllowInvalidCodeUnits::Yes) {
for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
if (is_high_surrogate(*ptr)) {
auto const* next = ptr + 1;

if ((next < end_ptr()) && is_low_surrogate(*next)) {
auto code_point = decode_surrogate_pair(*ptr, *next);
TRY(builder.try_append_code_point(code_point));
++ptr;
continue;
}
}
for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
if (is_high_surrogate(*ptr)) {
auto const* next = ptr + 1;

TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
if ((next < end_ptr()) && is_low_surrogate(*next)) {
auto code_point = decode_surrogate_pair(*ptr, *next);
TRY(builder.try_append_code_point(code_point));
++ptr;
continue;
}
}
return builder.to_string_without_validation();
}

for (auto code_point : *this)
TRY(builder.try_append_code_point(code_point));
TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
}

return builder.to_string();
return builder.to_string_without_validation();
}

size_t Utf16View::length_in_code_points() const
Expand Down Expand Up @@ -233,27 +262,27 @@ bool Utf16View::starts_with(Utf16View const& needle) const
return true;
}

bool Utf16View::validate(size_t& valid_code_units) const
bool Utf16View::validate() const
{
valid_code_units = 0;

for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
if (is_high_surrogate(*ptr)) {
if ((++ptr >= end_ptr()) || !is_low_surrogate(*ptr))
return false;
++valid_code_units;
} else if (is_low_surrogate(*ptr)) {
return false;
}
return simdutf::validate_utf16(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size());
}

++valid_code_units;
}
bool Utf16View::validate(size_t& valid_code_units) const
{
auto result = simdutf::validate_utf16_with_errors(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size());
valid_code_units = result.count;

return true;
return result.error == simdutf::SUCCESS;
}

size_t Utf16View::calculate_length_in_code_points() const
{
// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
// remove this branch.
if (validate()) [[likely]]
return simdutf::count_utf16(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size());

size_t code_points = 0;
for ([[maybe_unused]] auto code_point : *this)
++code_points;
Expand Down
8 changes: 2 additions & 6 deletions AK/Utf16View.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, Tim Flynn <[email protected]>
* Copyright (c) 2021-2024, Tim Flynn <[email protected]>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
Expand Down Expand Up @@ -113,12 +113,8 @@ class Utf16View {

bool starts_with(Utf16View const&) const;

bool validate() const;
bool validate(size_t& valid_code_units) const;
bool validate() const
{
size_t valid_code_units;
return validate(valid_code_units);
}

bool equals_ignoring_case(Utf16View const&) const;

Expand Down
28 changes: 28 additions & 0 deletions AK/Utf8View.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@
* SPDX-License-Identifier: BSD-2-Clause
*/

#define AK_DONT_REPLACE_STD

#include <AK/Assertions.h>
#include <AK/Debug.h>
#include <AK/Format.h>
#include <AK/Utf8View.h>

#include <simdutf.h>

namespace AK {

Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const
Expand Down Expand Up @@ -72,6 +76,12 @@ Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_

size_t Utf8View::calculate_length() const
{
// FIXME: simdutf's code point length method assumes valid UTF-8, whereas Utf8View uses U+FFFD as a replacement
// for invalid code points. If we change Utf8View to only accept valid encodings as an invariant, we can
// remove this branch.
if (validate()) [[likely]]
return simdutf::count_utf8(m_string.characters_without_null_termination(), m_string.length());

size_t length = 0;

for (size_t i = 0; i < m_string.length(); ++length) {
Expand Down Expand Up @@ -143,6 +153,24 @@ Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
return substring_view(substring_start, substring_length);
}

bool Utf8View::validate(size_t& valid_bytes, AllowSurrogates allow_surrogates) const
{
auto result = simdutf::validate_utf8_with_errors(m_string.characters_without_null_termination(), m_string.length());
valid_bytes = result.count;

if (result.error == simdutf::SURROGATE && allow_surrogates == AllowSurrogates::Yes) {
valid_bytes += 3; // All surrogates have a UTF-8 byte length of 3.

size_t substring_valid_bytes = 0;
auto is_valid = substring_view(valid_bytes).validate(substring_valid_bytes, allow_surrogates);

valid_bytes += substring_valid_bytes;
return is_valid;
}

return result.error == simdutf::SUCCESS;
}

Utf8CodePointIterator& Utf8CodePointIterator::operator++()
{
VERIFY(m_length > 0);
Expand Down
Loading