Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement data url parsing #756

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/ada.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "ada/character_sets-inl.h"
#include "ada/checkers-inl.h"
#include "ada/common_defs.h"
#include "ada/ada_data_url.h"
#include "ada/log.h"
#include "ada/encoding_type.h"
#include "ada/helpers.h"
Expand Down
33 changes: 33 additions & 0 deletions include/ada/ada_data_url.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#ifndef ADA_DATA_URL_H
#define ADA_DATA_URL_H

#include <cstddef>
#include <string>
#include <string_view>

namespace ada::data_url {
// https://fetch.spec.whatwg.org/#data-url-struct
/**
 * Result produced by parse_data_url().
 * @see https://fetch.spec.whatwg.org/#data-url-struct
 *
 * Every member manages its own storage, so the type follows the Rule of Zero:
 * the compiler-generated copy, move and destroy operations are used as-is.
 */
struct data_url {
  /** Set to false by parse_data_url() when the input cannot be processed. */
  bool is_valid = true;
  /** Body extracted from the URL by parse_data_url(). */
  std::string body{};
  /** MIME type information extracted from the URL by parse_data_url(). */
  std::string essence{};
};

/**
 * Processes a textual `data:` URL.
 * @see https://fetch.spec.whatwg.org/#data-url-processor
 *
 * @param data_url the URL text to process
 * @return a data_url whose `is_valid` member is false when processing failed
 */
ada::data_url::data_url parse_data_url(std::string_view data_url);

/**
 * Collects characters of `input`, beginning at `position`, that are not equal
 * to `c`.
 *
 * @param c        the character that stops the collection
 * @param input    the string being scanned
 * @param position in/out cursor; on return it is the index of the next `c` at
 *                 or after the starting position, or `input.length()` when
 *                 there is none
 * @return the collected characters
 */
std::string collect_sequence_of_code_points(char c, const std::string& input, size_t& position);

/**
 * Reports whether `c` is treated as ASCII whitespace by the data URL parser.
 */
bool isASCIIWhiteSpace(char c);

/**
 * Returns a copy of `input` with ASCII whitespace stripped.
 *
 * @param input    the string to strip
 * @param leading  strip whitespace at the start when true
 * @param trailing strip whitespace at the end when true
 */
std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing);

/**
 * Reports whether the text following the last ';' in `input` is an ASCII
 * case-insensitive match for "base64".
 *
 * NOTE(review): this is declared `static` in a header while its definition
 * lives in src/ada_data_url.cpp, so each including translation unit gets its
 * own internal-linkage declaration - confirm this is intended.
 */
static constexpr bool is_base64(std::string_view input);

}

#endif // ADA_DATA_URL_H
2 changes: 2 additions & 0 deletions include/ada/serializers.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ std::string ipv6(const std::array<uint16_t, 8>& address) noexcept;
*/
std::string ipv4(uint64_t address) noexcept;

/**
 * Serializes a URL to a string, optionally leaving out its fragment.
 * @see https://url.spec.whatwg.org/#concept-url-serializer
 *
 * @param url             the URL to serialize
 * @param excludeFragment when true, the fragment and its leading '#' are
 *                        omitted from the result
 * @return the serialized URL
 */
std::string url_serializer(const ada::url& url, bool excludeFragment) noexcept;

} // namespace ada::serializers

#endif // ADA_SERIALIZERS_H
3 changes: 2 additions & 1 deletion src/ada.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@
#include "parser.cpp"
#include "url_components.cpp"
#include "url_aggregator.cpp"
#include "ada_c.cpp"
#include "ada_c.cpp"
#include "ada_data_url.cpp"
132 changes: 132 additions & 0 deletions src/ada_data_url.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#include <string_view>
#include <cctype>

#include "ada.h"

namespace ada::data_url {

/**
 * Processes a `data:` URL following the WHATWG Fetch "data: URL processor".
 * @see https://fetch.spec.whatwg.org/#data-url-processor
 *
 * @param data_url the textual URL to process
 * @return a data_url with `is_valid == false` when the input is not a
 *         parsable `data:` URL or has no ',' separator. Otherwise `body`
 *         holds the percent-decoded body and `essence` holds the
 *         "type/subtype" part of the MIME type ("text/plain" when the URL
 *         does not carry a usable one).
 *
 * Known gaps: forgiving-base64 decoding of the body and full MIME type
 * parsing (token validation, lowercasing, parameters) are not implemented.
 */
ada::data_url::data_url parse_data_url(std::string_view data_url) {
  ada::data_url::data_url out{};

  auto url = ada::parse<ada::url>(data_url, nullptr);

  // 1. Assert: dataURL's scheme is "data".
  if (!url || url->get_protocol() != "data:") {
    out.is_valid = false;
    return out;
  }

  // 2. Let input be the result of running the URL serializer on dataURL with
  //    exclude fragment set to true.
  url->set_hash({});
  std::string input = url->get_href();

  // 3. Remove the leading "data:" from input.
  input.erase(0, 5);

  // 4. Let position point at the start of input.
  size_t position = 0;

  // 5. Let mimeType be the result of collecting a sequence of code points that
  //    are not equal to U+002C (,), given position.
  std::string mimetype = collect_sequence_of_code_points(',', input, position);

  // 6. Strip leading and trailing ASCII whitespace from mimeType.
  mimetype = removeASCIIWhiteSpace(mimetype, true, true);

  // 7. If position is past the end of input, then return failure.
  if (position >= input.length()) {
    out.is_valid = false;
    return out;
  }

  // 8. Advance position by 1 (skips the ',').
  ++position;

  // 9. Let encodedBody be the remainder of input.
  const std::string encoded_body = input.substr(position);

  // 10. Let body be the percent-decoding of encodedBody.
  //     The body is stored for every URL, not only for base64 ones.
  out.body = ada::unicode::percent_decode(encoded_body, encoded_body.find('%'));

  // 11. If mimeType ends with U+003B (;), followed by zero or more U+0020
  //     SPACE, followed by an ASCII case-insensitive match for "base64", then:
  //     The ';' is looked up in mimeType itself: an index taken from `input`
  //     may point into the body and be out of range for mimeType.
  const size_t base64_marker = mimetype.find_last_of(';');
  if (base64_marker != std::string::npos && is_base64(mimetype)) {
    // 11.1. Let stringBody be the isomorphic decode of body.
    // 11.2. Set body to the forgiving-base64 decode of stringBody.
    // 11.3. If body is failure, then return failure.
    // TODO: forgiving-base64 decoding is not implemented; until then the
    // percent-decoded text is returned unchanged in out.body.

    // 11.4. Remove the last 6 code points from mimeType.
    // 11.5. Remove trailing U+0020 SPACE code points from mimeType, if any.
    // 11.6. Remove the last U+003B (;) from mimeType.
    // "base64" follows the last ';', so erasing from that ';' does all three.
    mimetype.erase(base64_marker);
  }

  // 12. If mimeType starts with ";", then prepend "text/plain" to mimeType.
  if (!mimetype.empty() && mimetype.front() == ';') {
    mimetype.insert(0, "text/plain");
  }

  // 13. Let mimeTypeRecord be the result of parsing mimeType; on failure use
  //     text/plain;charset=US-ASCII.
  //     TODO: only the "type/subtype" split is checked here; a complete MIME
  //     type parser is still needed.
  out.essence = mimetype.substr(0, mimetype.find(';'));
  const size_t slash = out.essence.find('/');
  const bool has_type_and_subtype = slash != std::string::npos && slash != 0 &&
                                    slash + 1 != out.essence.length();
  if (!has_type_and_subtype) {
    out.essence = "text/plain";
  }

  return out;
}

/**
 * Collects the characters of `input`, starting at `position`, that come before
 * the next occurrence of `c`.
 *
 * @param c        the character that stops the collection
 * @param input    the string being scanned
 * @param position in/out cursor. On return it is the index of the next `c`,
 *                 or `input.length()` when `c` does not occur again. It is
 *                 left untouched when it already is at or past the end.
 * @return the collected characters (possibly empty); `c` itself is excluded
 */
std::string collect_sequence_of_code_points(char c, const std::string& input, size_t& position) {
  // Nothing can be collected once the cursor has reached the end; returning
  // early also keeps substr() from throwing on an out-of-range start.
  if (position >= input.length()) {
    return {};
  }

  const size_t start = position;
  const size_t found = input.find(c, start);
  position = (found == std::string::npos) ? input.length() : found;

  // substr() takes a character count, not an end index.
  return input.substr(start, position - start);
}

/**
 * Reports whether `c` is ASCII whitespace as defined by the WHATWG Infra
 * standard: U+0009 TAB, U+000A LF, U+000C FF, U+000D CR or U+0020 SPACE.
 * @see https://infra.spec.whatwg.org/#ascii-whitespace
 */
bool isASCIIWhiteSpace(char c) {
  return c == '\t' || c == '\n' || c == '\f' || c == '\r' || c == ' ';
}

/**
 * Returns a copy of `input` with ASCII whitespace stripped from its ends.
 *
 * @param input    the string to strip
 * @param leading  strip whitespace at the start when true
 * @param trailing strip whitespace at the end when true
 * @return the stripped copy; empty when `input` is empty or, with both flags
 *         set, consists only of whitespace
 */
std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing) {
  size_t first = 0;
  size_t last = input.length();  // one past the last character that is kept

  if (leading) {
    while (first < last && isASCIIWhiteSpace(input[first])) {
      ++first;
    }
  }

  if (trailing) {
    // Inspect the character before `last`; input[last] is past the kept range.
    while (last > first && isASCIIWhiteSpace(input[last - 1])) {
      --last;
    }
  }

  // substr() takes a character count, not an end index.
  return input.substr(first, last - first);
}

/**
 * Reports whether `input` ends with U+003B (;), followed by zero or more
 * U+0020 SPACE, followed by an ASCII case-insensitive match for "base64".
 * @see https://fetch.spec.whatwg.org/#data-url-processor (step 11)
 *
 * @param input the MIME type text taken from a data: URL
 * @return true when the text after the last ';' is "base64" (any letter case),
 *         optionally preceded by spaces
 */
static constexpr bool is_base64(std::string_view input) {
  const auto last_idx = input.find_last_of(';');
  if (last_idx == std::string_view::npos) {
    return false;
  }

  auto res = input.substr(last_idx + 1);

  // Skip the optional run of spaces between ';' and "base64".
  const auto first_non_space = res.find_first_not_of(' ');
  if (first_non_space == std::string_view::npos) {
    return false;
  }
  res.remove_prefix(first_non_space);

  // OR-ing with 0x20 folds ASCII upper-case letters to lower case.
  return res.size() == 6 && (res[0] | 0x20) == 'b' && (res[1] | 0x20) == 'a' &&
         (res[2] | 0x20) == 's' && (res[3] | 0x20) == 'e' && (res[4] == '6') &&
         (res[5] == '4');
}

}
18 changes: 18 additions & 0 deletions src/serializers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,22 @@ std::string ipv4(const uint64_t address) noexcept {
return output;
}

/**
 * Serializes `url`, optionally leaving out the fragment.
 *
 * @param url             the URL to serialize
 * @param excludeFragment when true, the fragment and its leading '#' are
 *                        dropped from the result
 * @return the href of `url`, with or without its fragment
 */
std::string url_serializer(const ada::url& url, bool excludeFragment) noexcept {
  std::string result = url.get_href();
  if (!excludeFragment) {
    return result;
  }

  const size_t fragment_size = url.has_hash() ? url.get_hash().size() : 0;

  // A non-empty fragment is cut off the tail of the href.
  if (fragment_size != 0) {
    return result.substr(0, result.length() - fragment_size);
  }

  // No fragment text was reported: only a dangling '#' can be left to remove.
  if (!result.empty() && result.back() == '#') {
    result.pop_back();
  }
  return result;
}

} // namespace ada::serializers
7 changes: 7 additions & 0 deletions tests/basic_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,4 +462,11 @@ TYPED_TEST(basic_tests, negativeport) {
auto url = ada::parse<TypeParam>("https://www.google.com");
ASSERT_FALSE(url->set_port("-1"));
SUCCEED();
}

// Parses a base64 data: URL and checks the reported validity, essence and body.
TYPED_TEST(basic_tests, data_url) {
  const auto parsed = ada::data_url::parse_data_url(
      "data:application/octet-stream;base64,YWJj");

  ASSERT_TRUE(parsed.is_valid);
  ASSERT_EQ(parsed.essence, "application/octet-stream");
  // The body is compared against the still-encoded text "YWJj".
  ASSERT_EQ(parsed.body, "YWJj");
}
Loading