// -----------------------------------------------------------------------------
// Provenance: [PATCH 1/2] "implement data url parsing"
//   commit 382def940b2ec44edd0ed450a36b92badca5e9d4
//   From: Khafra, Sun, 6 Oct 2024 23:55:55 -0400
//   Co-authored-by: Yagiz Nizipli
//
// Files touched by that commit (7 files changed, 195 insertions, 1 deletion):
//   include/ada.h                (+1)   adds `#include "ada/ada_data_url.h"`
//                                       between "ada/common_defs.h" and
//                                       "ada/log.h"
//   include/ada/ada_data_url.h   (+33)  new file, reproduced below
//   include/ada/serializers.h    (+2)
//   src/ada.cpp                  (+3/-1)
//   src/ada_data_url.cpp         (+132) new file
//   src/serializers.cpp          (+18)
//   tests/basic_tests.cpp        (+7)
// -----------------------------------------------------------------------------

// ===== include/ada/ada_data_url.h ============================================
#ifndef ADA_DATA_URL_H
#define ADA_DATA_URL_H

#include <cstddef>
#include <string>
#include <string_view>

namespace ada::data_url {

/**
 * Result of processing a `data:` URL.
 *
 * @see https://fetch.spec.whatwg.org/#data-url-struct
 *
 * No special member functions are declared: every member manages its own
 * storage, so the compiler-generated copy/move/destroy operations are the
 * right ones (Rule of Zero).
 */
struct data_url {
  /** False when the input could not be processed as a `data:` URL. */
  bool is_valid = true;
  /** Body of the URL (the part after the first comma). */
  std::string body{};
  /** MIME type portion of the URL without its parameters. */
  std::string essence{};
};

/**
 * Processes @p data_url as a `data:` URL.
 *
 * @param data_url the textual URL, e.g. "data:text/plain,hello".
 * @return a data_url whose `is_valid` flag is false on failure.
 */
ada::data_url::data_url parse_data_url(std::string_view data_url);

/**
 * Collects the characters of @p input starting at @p position up to, but not
 * including, the next occurrence of @p c (or the end of @p input).
 *
 * @param c        delimiter character.
 * @param input    string to read from.
 * @param position in/out cursor; on return it designates the delimiter, or
 *                 the end of @p input when no delimiter was found.
 * @return the collected characters.
 */
std::string collect_sequence_of_code_points(char c, const std::string& input,
                                            std::size_t& position);

/** @return true when @p c is ASCII whitespace. */
bool is_ascii_whitespace(char c);

/**
 * Removes ASCII whitespace from the requested end(s) of @p input.
 *
 * @param input    string to strip (taken by value, returned stripped).
 * @param leading  strip whitespace at the front.
 * @param trailing strip whitespace at the back.
 */
std::string remove_ascii_whitespace(std::string input, bool leading,
                                    bool trailing);

// NOTE(review): a `static` function declared in a header has internal linkage,
// so every translation unit that calls it needs its own definition. The
// specifier is kept so this declaration keeps matching the `static constexpr`
// definition in src/ada_data_url.cpp; consider moving the definition here as
// an `inline constexpr` function instead.
/** @return true when @p input ends with ";" + optional spaces + "base64". */
static constexpr bool is_base64(std::string_view input);

}  // namespace ada::data_url

#endif  // ADA_DATA_URL_H
a/include/ada/serializers.h b/include/ada/serializers.h index d8e0d3d6f..b2e27fbb1 100644 --- a/include/ada/serializers.h +++ b/include/ada/serializers.h @@ -40,6 +40,8 @@ std::string ipv6(const std::array& address) noexcept; */ std::string ipv4(uint64_t address) noexcept; +std::string url_serializer(const ada::url& url, bool excludeFragment) noexcept; + } // namespace ada::serializers #endif // ADA_SERIALIZERS_H diff --git a/src/ada.cpp b/src/ada.cpp index 164f37d74..99109e380 100644 --- a/src/ada.cpp +++ b/src/ada.cpp @@ -10,4 +10,5 @@ #include "parser.cpp" #include "url_components.cpp" #include "url_aggregator.cpp" -#include "ada_c.cpp" \ No newline at end of file +#include "ada_c.cpp" +#include "ada_data_url.cpp" \ No newline at end of file diff --git a/src/ada_data_url.cpp b/src/ada_data_url.cpp new file mode 100644 index 000000000..2a1fc1881 --- /dev/null +++ b/src/ada_data_url.cpp @@ -0,0 +1,132 @@ +#include +#include + +#include "ada.h" + +namespace ada::data_url { + +ada::data_url::data_url parse_data_url(std::string_view data_url) { + auto out = ada::data_url::data_url(); + + auto url = ada::parse(data_url, nullptr); + + // 1. Assert: dataURL’s scheme is "data". + if (!url || url->get_protocol() != "data:") { + out.is_valid = false; + return out; + } + + // 2. Let input be the result of running the URL serializer on dataURL with exclude + // fragment set to true. + url->set_hash({}); + auto input = url->get_href(); + + // 3. Remove the leading "data:" from input. + input.erase(0, 5); + + // 4. Let position point at the start of input. + size_t position = 0; + + // 5. Let mimeType be the result of collecting a sequence of code points that are + // not equal to U+002C (,), given position. + auto mimetype = collect_sequence_of_code_points(',', input, position); + auto mimetype_length = mimetype.length(); + + // 6. Strip leading and trailing ASCII whitespace from mimeType. + mimetype = removeASCIIWhiteSpace(mimetype, true, true); + + // 7. 
If position is past the end of input, then return failure. + if (position >= input.length()) { + out.is_valid = false; + return out; + } + + // 8. Advance position by 1. + position++; + + // 9. Let encodedBody be the remainder of input. + std::string encoded_body = input.substr(mimetype_length + 1); + + // 10. Let body be the percent-decoding of encodedBody. + encoded_body = ada::unicode::percent_decode(encoded_body, encoded_body.find('%')); + + // 11. If mimeType ends with U+003B (;), followed by zero or more U+0020 SPACE, + // followed by an ASCII case-insensitive match for "base64", then: + size_t last_semi_colon = input.find_last_of(';'); + + if (last_semi_colon != std::string::npos) { + size_t next_non_space = input.find_first_not_of(' ', last_semi_colon); + + out.essence = mimetype.substr(0, last_semi_colon); + + if (is_base64(mimetype)) { + + // 11.1. Let stringBody be the isomorphic decode of body. + auto string_body = encoded_body; + + // 11.2. Set body to the forgiving-base64 decode of stringBody. + // 11.3. If body is failure, then return failure. + // TODO + out.body = string_body; + + // 11.4. Remove the last 6 code points from mimeType. + // 11.5. Remove trailing U+0020 SPACE code points from mimeType, if any. + // 11.6. Remove the last U+003B (;) from mimeType. + mimetype.erase(last_semi_colon); + } + } + + // 12. If mimeType starts with ";", then prepend "text/plain" to mimeType. 
+ if (mimetype.starts_with(';')) { + mimetype = "text/plain" + mimetype; + } + + return out; +} + +std::string collect_sequence_of_code_points(char c, const std::string& input, size_t& position) { + auto idx = input.find_first_of(c, position); + size_t start = position; + + if (idx == std::string::npos) { + position = reinterpret_cast(input.length()); + return input.substr(start); + } + + position = reinterpret_cast(idx); + return input.substr(start, position); +} + +std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing) { + size_t lead = 0; + size_t trail = input.length(); + + if (leading) { + while (lead < input.length() && isASCIIWhiteSpace(input[lead])) + lead++; + } + + if (trailing) { + while (trail > 0 && isASCIIWhiteSpace(input[trail])) + trail--; + } + + return input.substr(lead, trail); +} + +bool isASCIIWhiteSpace(char c) { + return c == '\r' || c == '\n' || c == '\t' || c == '\f'; +} + +static constexpr bool is_base64(std::string_view input) { + auto last_idx = input.find_last_of(';'); + if (last_idx != std::string_view::npos) { + // TODO(@anonrig): Trim input + auto res = input.substr(last_idx + 1); + return res.size() == 6 && (res[0] | 0x20) == 'b' && (res[1] | 0x20) == 'a' && + (res[2] | 0x20) == 's' && (res[3] | 0x20) == 'e' && (res[4] == '6') && (res[5] == '4'); + } + return false; +} + +} diff --git a/src/serializers.cpp b/src/serializers.cpp index 91be39ce1..cc9a9974b 100644 --- a/src/serializers.cpp +++ b/src/serializers.cpp @@ -77,4 +77,22 @@ std::string ipv4(const uint64_t address) noexcept { return output; } +std::string url_serializer(const ada::url& url, bool excludeFragment) noexcept { + if (!excludeFragment) { + return url.get_href(); + } + + std::string href = url.get_href(); + size_t hashLength = url.has_hash() ? url.get_hash().size() : 0; + + std::string serialized = hashLength == 0 ? 
href : href.substr(0, href.length() - hashLength); + + if (hashLength == 0 && href.ends_with('#')) { + serialized.pop_back(); + return serialized; + } + + return serialized; +} + } // namespace ada::serializers diff --git a/tests/basic_tests.cpp b/tests/basic_tests.cpp index d1d452a42..be35463e1 100644 --- a/tests/basic_tests.cpp +++ b/tests/basic_tests.cpp @@ -462,4 +462,11 @@ TYPED_TEST(basic_tests, negativeport) { auto url = ada::parse("https://www.google.com"); ASSERT_FALSE(url->set_port("-1")); SUCCEED(); +} + +TYPED_TEST(basic_tests, data_url) { + auto data_url = ada::data_url::parse_data_url("data:application/octet-stream;base64,YWJj"); + ASSERT_TRUE(data_url.is_valid); + ASSERT_EQ(data_url.essence, "application/octet-stream"); + ASSERT_EQ(data_url.body, "YWJj"); } \ No newline at end of file From de37309fd2ab364ffbbb6e708bbb07b759ad04b2 Mon Sep 17 00:00:00 2001 From: Khafra Date: Thu, 24 Oct 2024 18:58:48 -0400 Subject: [PATCH 2/2] fixup --- include/ada/ada_data_url.h | 4 ++-- src/ada_data_url.cpp | 28 +++++++++++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/ada/ada_data_url.h b/include/ada/ada_data_url.h index 8f06b780e..e9fc456f5 100644 --- a/include/ada/ada_data_url.h +++ b/include/ada/ada_data_url.h @@ -22,9 +22,9 @@ ada::data_url::data_url parse_data_url(std::string_view data_url); std::string collect_sequence_of_code_points(char c, const std::string& input, size_t& position); -bool isASCIIWhiteSpace(char c); +bool is_ascii_whitespace(char c); -std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing); +std::string remove_ascii_whitespace(std::string input, bool leading, bool trailing); static constexpr bool is_base64(std::string_view input); diff --git a/src/ada_data_url.cpp b/src/ada_data_url.cpp index 2a1fc1881..a5c831c6f 100644 --- a/src/ada_data_url.cpp +++ b/src/ada_data_url.cpp @@ -33,7 +33,7 @@ ada::data_url::data_url parse_data_url(std::string_view data_url) { auto 
mimetype_length = mimetype.length(); // 6. Strip leading and trailing ASCII whitespace from mimeType. - mimetype = removeASCIIWhiteSpace(mimetype, true, true); + mimetype = remove_ascii_whitespace(mimetype, true, true); // 7. If position is past the end of input, then return failure. if (position >= input.length()) { @@ -89,33 +89,43 @@ std::string collect_sequence_of_code_points(char c, const std::string& input, si size_t start = position; if (idx == std::string::npos) { - position = reinterpret_cast(input.length()); + position = static_cast(input.length()); return input.substr(start); } - position = reinterpret_cast(idx); + position = static_cast(idx); return input.substr(start, position); } -std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing) { +std::string remove_ascii_whitespace(std::string input, bool leading, bool trailing) { size_t lead = 0; size_t trail = input.length(); if (leading) { - while (lead < input.length() && isASCIIWhiteSpace(input[lead])) + while (lead < input.length() && is_ascii_whitespace(input[lead])) { lead++; + } + + if (lead != 0) { + input.erase(lead); + } } if (trailing) { - while (trail > 0 && isASCIIWhiteSpace(input[trail])) + while (trail > 0 && is_ascii_whitespace(input[trail])) { trail--; + } + + if (trail != input.length()) { + input.resize(input.length() - trail); + } } - return input.substr(lead, trail); + return input; } -bool isASCIIWhiteSpace(char c) { - return c == '\r' || c == '\n' || c == '\t' || c == '\f'; +bool is_ascii_whitespace(char c) { + return c == '\r' || c == '\n' || c == '\t' || c == '\f' || c == ' '; } static constexpr bool is_base64(std::string_view input) {