Skip to content

Commit

Permalink
src: use simdutf for converting externalized builtins to UTF-16
Browse files Browse the repository at this point in the history
Remove the dependency on ICU for this part, as well as the
hacky way of converting embedder main sources to UTF-8 via
V8 APIs. Allow `UnionBytes` to own the memory its pointing
to in order to simplify the code on the `BuiltinLoader` side.

PR-URL: nodejs#46119
Reviewed-By: Yagiz Nizipli <[email protected]>
Reviewed-By: James M Snell <[email protected]>
Reviewed-By: Rafael Gonzaga <[email protected]>
Reviewed-By: Darshan Sen <[email protected]>
  • Loading branch information
addaleax authored Jan 10, 2023
1 parent 757c104 commit 21fb98e
Show file tree
Hide file tree
Showing 10 changed files with 106 additions and 105 deletions.
6 changes: 1 addition & 5 deletions configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2024,11 +2024,7 @@ def make_bin_override():
for builtin in shareable_builtins:
builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path'
if getattr(options, builtin_id):
if options.with_intl == 'none':
option_name = '--shared-builtin-' + builtin + '-path'
error(option_name + ' is incompatible with --with-intl=none' )
else:
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
else:
output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]]

Expand Down
1 change: 1 addition & 0 deletions node.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,7 @@
'deps/googletest/googletest.gyp:gtest_main',
'deps/histogram/histogram.gyp:histogram',
'deps/uvwasi/uvwasi.gyp:uvwasi',
'deps/simdutf/simdutf.gyp:simdutf',
],

'includes': [
Expand Down
15 changes: 2 additions & 13 deletions src/api/environment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -467,21 +467,10 @@ MaybeLocal<Value> LoadEnvironment(
Environment* env,
const char* main_script_source_utf8) {
CHECK_NOT_NULL(main_script_source_utf8);
Isolate* isolate = env->isolate();
return LoadEnvironment(
env,
[&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
// This is a slightly hacky way to convert UTF-8 to UTF-16.
Local<String> str =
String::NewFromUtf8(isolate,
main_script_source_utf8).ToLocalChecked();
auto main_utf16 = std::make_unique<String::Value>(isolate, str);

// TODO(addaleax): Avoid having a global table for all scripts.
env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
std::string name = "embedder_main_" + std::to_string(env->thread_id());
builtins::BuiltinLoader::Add(
name.c_str(), UnionBytes(**main_utf16, main_utf16->length()));
env->set_main_utf16(std::move(main_utf16));
builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8);
Realm* realm = env->principal_realm();

return realm->ExecuteBootstrapper(name.c_str());
Expand Down
5 changes: 0 additions & 5 deletions src/env-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -805,11 +805,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) {
cleanup_queue_.Remove(fn, arg);
}

void Environment::set_main_utf16(std::unique_ptr<v8::String::Value> str) {
CHECK(!main_utf16_);
main_utf16_ = std::move(str);
}

void Environment::set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler) {
process_exit_handler_ = std::move(handler);
Expand Down
6 changes: 0 additions & 6 deletions src/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,6 @@ class Environment : public MemoryRetainer {

#endif // HAVE_INSPECTOR

inline void set_main_utf16(std::unique_ptr<v8::String::Value>);
inline void set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler);

Expand Down Expand Up @@ -1144,11 +1143,6 @@ class Environment : public MemoryRetainer {

std::unique_ptr<Realm> principal_realm_ = nullptr;

// Keeps the main script source alive is one was passed to LoadEnvironment().
// We should probably find a way to just use plain `v8::String`s created from
// the source passed to LoadEnvironment() directly instead.
std::unique_ptr<v8::String::Value> main_utf16_;

// Used by allocate_managed_buffer() and release_managed_buffer() to keep
// track of the BackingStore for a given pointer.
std::unordered_map<char*, std::unique_ptr<v8::BackingStore>>
Expand Down
26 changes: 14 additions & 12 deletions src/node_builtins.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "env-inl.h"
#include "node_external_reference.h"
#include "node_internals.h"
#include "simdutf.h"
#include "util-inl.h"

namespace node {
Expand Down Expand Up @@ -35,7 +36,6 @@ BuiltinLoader BuiltinLoader::instance_;

BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
LoadJavaScriptSource();
#if defined(NODE_HAVE_I18N_SUPPORT)
#ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH
AddExternalizedBuiltin(
"internal/deps/cjs-module-lexer/lexer",
Expand All @@ -52,7 +52,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
AddExternalizedBuiltin("internal/deps/undici/undici",
STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH));
#endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH
#endif // NODE_HAVE_I18N_SUPPORT
}

BuiltinLoader* BuiltinLoader::GetInstance() {
Expand Down Expand Up @@ -240,7 +239,6 @@ MaybeLocal<String> BuiltinLoader::LoadBuiltinSource(Isolate* isolate,
#endif // NODE_BUILTIN_MODULES_PATH
}

#if defined(NODE_HAVE_I18N_SUPPORT)
void BuiltinLoader::AddExternalizedBuiltin(const char* id,
const char* filename) {
std::string source;
Expand All @@ -252,16 +250,20 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id,
return;
}

icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8(
icu::StringPiece(source.data(), source.length()));
auto source_utf16 = std::make_unique<icu::UnicodeString>(utf16);
Add(id,
UnionBytes(reinterpret_cast<const uint16_t*>((*source_utf16).getBuffer()),
utf16.length()));
// keep source bytes for builtin alive while BuiltinLoader exists
GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16));
Add(id, source);
}

bool BuiltinLoader::Add(const char* id, std::string_view utf8source) {
size_t expected_u16_length =
simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length());
auto out = std::make_shared<std::vector<uint16_t>>(expected_u16_length);
size_t u16_length = simdutf::convert_utf8_to_utf16le(
utf8source.data(),
utf8source.length(),
reinterpret_cast<char16_t*>(out->data()));
out->resize(u16_length);
return Add(id, UnionBytes(out));
}
#endif // NODE_HAVE_I18N_SUPPORT

// Returns Local<Function> of the compiled module if return_code_cache
// is false (we are only compiling the function).
Expand Down
9 changes: 1 addition & 8 deletions src/node_builtins.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@
#include <memory>
#include <set>
#include <string>
#if defined(NODE_HAVE_I18N_SUPPORT)
#include <unicode/unistr.h>
#endif // NODE_HAVE_I18N_SUPPORT
#include <vector>
#include "node_mutex.h"
#include "node_union_bytes.h"
Expand Down Expand Up @@ -71,6 +68,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static v8::Local<v8::String> GetConfigString(v8::Isolate* isolate);
static bool Exists(const char* id);
static bool Add(const char* id, const UnionBytes& source);
static bool Add(const char* id, std::string_view utf8source);

static bool CompileAllBuiltins(v8::Local<v8::Context> context);
static void RefreshCodeCache(const std::vector<CodeCacheInfo>& in);
Expand Down Expand Up @@ -136,18 +134,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static void HasCachedBuiltins(
const v8::FunctionCallbackInfo<v8::Value>& args);

#if defined(NODE_HAVE_I18N_SUPPORT)
static void AddExternalizedBuiltin(const char* id, const char* filename);
#endif // NODE_HAVE_I18N_SUPPORT

static BuiltinLoader instance_;
BuiltinCategories builtin_categories_;
BuiltinSourceMap source_;
BuiltinCodeCacheMap code_cache_;
UnionBytes config_;
#if defined(NODE_HAVE_I18N_SUPPORT)
std::list<std::unique_ptr<icu::UnicodeString>> externalized_source_bytes_;
#endif // NODE_HAVE_I18N_SUPPORT

// Used to synchronize access to the code cache map
Mutex code_cache_mutex_;
Expand Down
64 changes: 9 additions & 55 deletions src/node_union_bytes.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,57 +11,20 @@

namespace node {

class NonOwningExternalOneByteResource
: public v8::String::ExternalOneByteStringResource {
public:
explicit NonOwningExternalOneByteResource(const uint8_t* data, size_t length)
: data_(data), length_(length) {}
~NonOwningExternalOneByteResource() override = default;

const char* data() const override {
return reinterpret_cast<const char*>(data_);
}
size_t length() const override { return length_; }

NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
delete;
NonOwningExternalOneByteResource& operator=(
const NonOwningExternalOneByteResource&) = delete;

private:
const uint8_t* data_;
size_t length_;
};

class NonOwningExternalTwoByteResource
: public v8::String::ExternalStringResource {
public:
explicit NonOwningExternalTwoByteResource(const uint16_t* data, size_t length)
: data_(data), length_(length) {}
~NonOwningExternalTwoByteResource() override = default;

const uint16_t* data() const override { return data_; }
size_t length() const override { return length_; }

NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
delete;
NonOwningExternalTwoByteResource& operator=(
const NonOwningExternalTwoByteResource&) = delete;

private:
const uint16_t* data_;
size_t length_;
};

// Similar to a v8::String, but it's independent from Isolates
// and can be materialized in Isolates as external Strings
// via ToStringChecked. The data pointers are owned by the caller.
// via ToStringChecked.
class UnionBytes {
public:
UnionBytes(const uint16_t* data, size_t length)
: one_bytes_(nullptr), two_bytes_(data), length_(length) {}
UnionBytes(const uint8_t* data, size_t length)
: one_bytes_(data), two_bytes_(nullptr), length_(length) {}
template <typename T> // T = uint8_t or uint16_t
explicit UnionBytes(std::shared_ptr<std::vector</*const*/ T>> data)
: UnionBytes(data->data(), data->size()) {
owning_ptr_ = data;
}

UnionBytes(const UnionBytes&) = default;
UnionBytes& operator=(const UnionBytes&) = default;
Expand All @@ -77,23 +40,14 @@ class UnionBytes {
CHECK_NOT_NULL(one_bytes_);
return one_bytes_;
}
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const {
if (is_one_byte()) {
NonOwningExternalOneByteResource* source =
new NonOwningExternalOneByteResource(one_bytes_data(), length_);
return v8::String::NewExternalOneByte(isolate, source).ToLocalChecked();
} else {
NonOwningExternalTwoByteResource* source =
new NonOwningExternalTwoByteResource(two_bytes_data(), length_);
return v8::String::NewExternalTwoByte(isolate, source).ToLocalChecked();
}
}
size_t length() { return length_; }
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const;
size_t length() const { return length_; }

private:
const uint8_t* one_bytes_;
const uint16_t* two_bytes_;
size_t length_;
std::shared_ptr<void> owning_ptr_;
};

} // namespace node
Expand Down
62 changes: 62 additions & 0 deletions src/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -515,4 +515,66 @@ void SetConstructorFunction(Isolate* isolate,
that->Set(name, tmpl);
}

namespace {

class NonOwningExternalOneByteResource
: public v8::String::ExternalOneByteStringResource {
public:
explicit NonOwningExternalOneByteResource(const UnionBytes& source)
: source_(source) {}
~NonOwningExternalOneByteResource() override = default;

const char* data() const override {
return reinterpret_cast<const char*>(source_.one_bytes_data());
}
size_t length() const override { return source_.length(); }

NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
delete;
NonOwningExternalOneByteResource& operator=(
const NonOwningExternalOneByteResource&) = delete;

private:
const UnionBytes source_;
};

class NonOwningExternalTwoByteResource
: public v8::String::ExternalStringResource {
public:
explicit NonOwningExternalTwoByteResource(const UnionBytes& source)
: source_(source) {}
~NonOwningExternalTwoByteResource() override = default;

const uint16_t* data() const override { return source_.two_bytes_data(); }
size_t length() const override { return source_.length(); }

NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
delete;
NonOwningExternalTwoByteResource& operator=(
const NonOwningExternalTwoByteResource&) = delete;

private:
const UnionBytes source_;
};

} // anonymous namespace

Local<String> UnionBytes::ToStringChecked(Isolate* isolate) const {
if (UNLIKELY(length() == 0)) {
// V8 requires non-null data pointers for empty external strings,
// but we don't guarantee that. Solve this by not creating an
// external string at all in that case.
return String::Empty(isolate);
}
if (is_one_byte()) {
NonOwningExternalOneByteResource* source =
new NonOwningExternalOneByteResource(*this);
return String::NewExternalOneByte(isolate, source).ToLocalChecked();
} else {
NonOwningExternalTwoByteResource* source =
new NonOwningExternalTwoByteResource(*this);
return String::NewExternalTwoByte(isolate, source).ToLocalChecked();
}
}

} // namespace node
17 changes: 16 additions & 1 deletion test/cctest/test_util.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "util-inl.h"
#include "debug_utils-inl.h"
#include "env-inl.h"
#include "gtest/gtest.h"
#include "simdutf.h"
#include "util-inl.h"

using node::Calloc;
using node::Malloc;
Expand Down Expand Up @@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) {
const std::string with_zero = std::string("a") + '\0' + 'b';
EXPECT_EQ(SPrintF("%s", with_zero), with_zero);
}

TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) {
// In simdutf, "LE" does *not* refer to Little Endian, it refers
// to 16-byte code units that are stored using *host* endianness.
// This is weird and confusing naming, and so we add this assertion
// here to verify that this is actually the case (so that CI tells
// us if it changed, because for most people Little Endian is
// host endianness, so locally everything would work fine).
const char utf8source[] = "\xe7\x8c\xab";
char16_t u16output;
size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output);
EXPECT_EQ(u16len, 1u);
EXPECT_EQ(u16output, 0x732B);
}

0 comments on commit 21fb98e

Please sign in to comment.