LadybirdBrowser · awesomekling · Jul 18, 2024 · Jul 17, 2024 · Jul 17, 2024 · Jul 16, 2024
diff --git a/.github/workflows/lagom-template.yml b/.github/workflows/lagom-template.yml
@@ -104,21 +104,20 @@ jobs:
         run: |
           set -e
 
-          cmake -GNinja -S Meta/Lagom -B ${{ github.workspace }}/tools-build \
+          cmake --preset=CI -S Meta/Lagom -B ${{ github.workspace }}/Build/tools-build \
             -DLAGOM_TOOLS_ONLY=ON \
             -DINSTALL_LAGOM_TOOLS=ON \
-            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/tool-install \
-            -DSERENITY_CACHE_DIR=${{ github.workspace }}/Build/caches \
+            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/Build/tools-install \
             -DCMAKE_C_COMPILER=gcc-13 \
             -DCMAKE_CXX_COMPILER=g++-13 \
             -Dpackage=LagomTools
 
-          ninja -C tools-build install
+          ninja -C ${{ github.workspace }}/Build/tools-build install
 
           cmake --preset Fuzzers_CI -B Build \
             -DCMAKE_C_COMPILER=${{ steps.build-parameters.outputs.host_cc }} \
             -DCMAKE_CXX_COMPILER=${{ steps.build-parameters.outputs.host_cxx }} \
-            -DCMAKE_PREFIX_PATH=${{ github.workspace }}/tool-install
+            -DCMAKE_PREFIX_PATH=${{ github.workspace }}/Build/tools-install
 
       # === BUILD ===
 

diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(SOURCES
     Assertions.cpp
+    Base64.cpp
     CircularBuffer.cpp
     ConstrainedStream.cpp
     CountingStream.cpp
@@ -37,10 +38,6 @@ set(SOURCES
     kmalloc.cpp
 )
 
-if (NOT LAGOM_TOOLS_ONLY)
-    list(APPEND SOURCES Base64.cpp)
-endif()
-
 serenity_lib(AK ak)
 
 serenity_install_headers(AK)
@@ -60,7 +57,5 @@ else()
     message(WARNING "Backtrace not found, stack traces will be unavailable")
 endif()
 
-if (NOT LAGOM_TOOLS_ONLY)
-    find_package(simdutf REQUIRED)
-    target_link_libraries(AK PRIVATE simdutf::simdutf)
-endif()
+find_package(simdutf REQUIRED)
+target_link_libraries(AK PRIVATE simdutf::simdutf)
diff --git a/AK/String.cpp b/AK/String.cpp
@@ -4,16 +4,21 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#define AK_DONT_REPLACE_STD
+
 #include <AK/Array.h>
 #include <AK/Checked.h>
 #include <AK/FlyString.h>
 #include <AK/Format.h>
 #include <AK/MemMem.h>
 #include <AK/Stream.h>
 #include <AK/String.h>
+#include <AK/Utf16View.h>
 #include <AK/Vector.h>
 #include <stdlib.h>
 
+#include <simdutf.h>
+
 namespace AK {
 
 String String::from_utf8_without_validation(ReadonlyBytes bytes)
@@ -39,6 +44,30 @@ ErrorOr<String> String::from_utf8(StringView view)
     return result;
 }
 
+ErrorOr<String> String::from_utf16(Utf16View const& utf16)
+{
+    // Reject malformed input up front; the conversion below asserts that every code unit was transcoded.
+    if (!utf16.validate())
+        return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");
+
+    auto const* code_units = reinterpret_cast<char16_t const*>(utf16.data());
+    auto code_unit_count = utf16.length_in_code_units();
+
+    String result;
+
+    // Size the new string exactly, then let simdutf transcode straight into its buffer.
+    auto utf8_length = simdutf::utf8_length_from_utf16(code_units, code_unit_count);
+
+    TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
+        [[maybe_unused]] auto bytes_written = simdutf::convert_utf16_to_utf8(code_units, code_unit_count, reinterpret_cast<char*>(buffer.data()));
+        ASSERT(bytes_written == buffer.size());
+
+        return {};
+    }));
+
+    return result;
+}
+
 ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
 {
     String result;

diff --git a/AK/String.h b/AK/String.h
@@ -50,12 +50,16 @@ class String : public Detail::StringBase {
 
     // Creates a new String from a sequence of UTF-8 encoded code points.
     static ErrorOr<String> from_utf8(StringView);
+
     template<typename T>
     requires(IsOneOf<RemoveCVReference<T>, ByteString, DeprecatedFlyString, FlyString, String>)
     static ErrorOr<String> from_utf8(T&&) = delete;
 
     [[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes);
 
+    // Creates a new String from a sequence of UTF-16 encoded code points.
+    static ErrorOr<String> from_utf16(Utf16View const&);
+
     // Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
     static ErrorOr<String> from_stream(Stream&, size_t byte_count);
 

diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp
@@ -1,9 +1,11 @@
 /*
- * Copyright (c) 2021-2023, Tim Flynn <[email protected]>
+ * Copyright (c) 2021-2024, Tim Flynn <[email protected]>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#define AK_DONT_REPLACE_STD
+
 #include <AK/CharacterTypes.h>
 #include <AK/Concepts.h>
 #include <AK/StringBuilder.h>
@@ -12,6 +14,8 @@
 #include <AK/Utf32View.h>
 #include <AK/Utf8View.h>
 
+#include <simdutf.h>
+
 namespace AK {
 
 static constexpr u16 high_surrogate_min = 0xd800;
@@ -22,7 +26,7 @@ static constexpr u32 replacement_code_point = 0xfffd;
 static constexpr u32 first_supplementary_plane_code_point = 0x10000;
 
 template<OneOf<Utf8View, Utf32View> UtfViewType>
-static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view)
+static ErrorOr<Utf16Data> to_utf16_slow(UtfViewType const& view)
 {
     Utf16Data utf16_data;
     TRY(utf16_data.try_ensure_capacity(view.length()));
@@ -35,17 +39,45 @@ static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view)
 
 ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view)
 {
-    return to_utf16_impl(Utf8View { utf8_view });
+    return utf8_to_utf16(Utf8View { utf8_view });
 }
 
 ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view)
 {
-    return to_utf16_impl(utf8_view);
+    // simdutf does not permit lonely surrogates, yet all callers want them allowed, so anything that is not
+    // strictly valid UTF-8 is converted one code point at a time instead.
+    if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
+        return to_utf16_slow(utf8_view);
+
+    auto const* utf8_bytes = reinterpret_cast<char const*>(utf8_view.bytes());
+    auto utf8_byte_count = utf8_view.byte_length();
+
+    // Resize to the exact number of code units up front so simdutf can write directly into the buffer.
+    Utf16Data utf16_data;
+    TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(utf8_bytes, utf8_byte_count)));
+
+    [[maybe_unused]] auto code_units_written = simdutf::convert_utf8_to_utf16(
+        utf8_bytes, utf8_byte_count, reinterpret_cast<char16_t*>(utf16_data.data()));
+    ASSERT(code_units_written == utf16_data.size());
+
+    return utf16_data;
 }
 
 ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view)
 {
-    return to_utf16_impl(utf32_view);
+    auto const* code_points = reinterpret_cast<char32_t const*>(utf32_view.code_points());
+
+    Utf16Data utf16_data;
+    TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32(code_points, utf32_view.length())));
+
+    auto code_units_written = simdutf::convert_utf32_to_utf16(
+        code_points, utf32_view.length(), reinterpret_cast<char16_t*>(utf16_data.data()));
+
+    // simdutf reports 0 code units written for input it rejects (e.g. lonely surrogates). Rather than rely on
+    // an assertion that may be compiled out, convert such input one code point at a time, as utf8_to_utf16 does.
+    if (code_units_written != utf16_data.size()) [[unlikely]]
+        return to_utf16_slow(utf32_view);
+    return utf16_data;
 }
 
 ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point)
@@ -88,30 +120,27 @@ ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invali
 
 ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
 {
+    if (allow_invalid_code_units == AllowInvalidCodeUnits::No)
+        return String::from_utf16(*this);
+
     StringBuilder builder;
 
-    if (allow_invalid_code_units == AllowInvalidCodeUnits::Yes) {
-        for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
-            if (is_high_surrogate(*ptr)) {
-                auto const* next = ptr + 1;
-
-                if ((next < end_ptr()) && is_low_surrogate(*next)) {
-                    auto code_point = decode_surrogate_pair(*ptr, *next);
-                    TRY(builder.try_append_code_point(code_point));
-                    ++ptr;
-                    continue;
-                }
-            }
+    for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
+        if (is_high_surrogate(*ptr)) {
+            auto const* next = ptr + 1;
 
-            TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
+            if ((next < end_ptr()) && is_low_surrogate(*next)) {
+                auto code_point = decode_surrogate_pair(*ptr, *next);
+                TRY(builder.try_append_code_point(code_point));
+                ++ptr;
+                continue;
+            }
         }
-        return builder.to_string_without_validation();
-    }
 
-    for (auto code_point : *this)
-        TRY(builder.try_append_code_point(code_point));
+        TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
+    }
 
-    return builder.to_string();
+    return builder.to_string_without_validation();
 }
 
 size_t Utf16View::length_in_code_points() const
@@ -233,27 +262,27 @@ bool Utf16View::starts_with(Utf16View const& needle) const
     return true;
 }
 
-bool Utf16View::validate(size_t& valid_code_units) const
+bool Utf16View::validate() const
 {
-    valid_code_units = 0;
-
-    for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
-        if (is_high_surrogate(*ptr)) {
-            if ((++ptr >= end_ptr()) || !is_low_surrogate(*ptr))
-                return false;
-            ++valid_code_units;
-        } else if (is_low_surrogate(*ptr)) {
-            return false;
-        }
+    return simdutf::validate_utf16(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size());
+}
 
-        ++valid_code_units;
-    }
+bool Utf16View::validate(size_t& valid_code_units) const
+{
+    auto result = simdutf::validate_utf16_with_errors(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size());
+    valid_code_units = result.count;
 
-    return true;
+    return result.error == simdutf::SUCCESS;
 }
 
 size_t Utf16View::calculate_length_in_code_points() const
 {
+    // FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
+    //        for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
+    //        remove this branch.
+    if (validate()) [[likely]]
+        return simdutf::count_utf16(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size());
+
     size_t code_points = 0;
     for ([[maybe_unused]] auto code_point : *this)
         ++code_points;

diff --git a/AK/Utf16View.h b/AK/Utf16View.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, Tim Flynn <[email protected]>
+ * Copyright (c) 2021-2024, Tim Flynn <[email protected]>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
@@ -113,12 +113,8 @@ class Utf16View {
 
     bool starts_with(Utf16View const&) const;
 
+    bool validate() const;
     bool validate(size_t& valid_code_units) const;
-    bool validate() const
-    {
-        size_t valid_code_units;
-        return validate(valid_code_units);
-    }
 
     bool equals_ignoring_case(Utf16View const&) const;
 

diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
@@ -5,11 +5,15 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#define AK_DONT_REPLACE_STD
+
 #include <AK/Assertions.h>
 #include <AK/Debug.h>
 #include <AK/Format.h>
 #include <AK/Utf8View.h>
 
+#include <simdutf.h>
+
 namespace AK {
 
 Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const
@@ -72,6 +76,12 @@ Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_
 
 size_t Utf8View::calculate_length() const
 {
+    // FIXME: simdutf's code point length method assumes valid UTF-8, whereas Utf8View uses U+FFFD as a replacement
+    //        for invalid code points. If we change Utf8View to only accept valid encodings as an invariant, we can
+    //        remove this branch.
+    if (validate()) [[likely]]
+        return simdutf::count_utf8(m_string.characters_without_null_termination(), m_string.length());
+
     size_t length = 0;
 
     for (size_t i = 0; i < m_string.length(); ++length) {
@@ -143,6 +153,24 @@ Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
     return substring_view(substring_start, substring_length);
 }
 
+bool Utf8View::validate(size_t& valid_bytes, AllowSurrogates allow_surrogates) const
+{
+    // Resume after each permitted surrogate in a loop; recursing once per surrogate could exhaust the stack on large inputs.
+    valid_bytes = 0;
+
+    for (;;) {
+        auto result = simdutf::validate_utf8_with_errors(m_string.characters_without_null_termination() + valid_bytes, m_string.length() - valid_bytes);
+        valid_bytes += result.count; // On failure, count is the offset of the first invalid sequence.
+
+        if (result.error == simdutf::SUCCESS)
+            return true;
+        if (result.error != simdutf::SURROGATE || allow_surrogates != AllowSurrogates::Yes)
+            return false;
+
+        valid_bytes += 3; // All surrogates have a UTF-8 byte length of 3.
+    }
+}
+
 Utf8CodePointIterator& Utf8CodePointIterator::operator++()
 {
     VERIFY(m_length > 0);