From 382def940b2ec44edd0ed450a36b92badca5e9d4 Mon Sep 17 00:00:00 2001
From: Khafra <maitken033380023@gmail.com>
Date: Sun, 6 Oct 2024 23:55:55 -0400
Subject: [PATCH 1/2] implement data url parsing

Co-authored-by: Yagiz Nizipli <yagiz@nizipli.com>
---
 include/ada.h              |   1 +
 include/ada/ada_data_url.h |  33 ++++++++++
 include/ada/serializers.h  |   2 +
 src/ada.cpp                |   3 +-
 src/ada_data_url.cpp       | 132 +++++++++++++++++++++++++++++++++++++
 src/serializers.cpp        |  18 +++++
 tests/basic_tests.cpp      |   7 ++
 7 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 include/ada/ada_data_url.h
 create mode 100644 src/ada_data_url.cpp
diff --git a/include/ada.h b/include/ada.h
index c5d0946ec..04eacb168 100644
--- a/include/ada.h
+++ b/include/ada.h
@@ -9,6 +9,7 @@
 #include "ada/character_sets-inl.h"
 #include "ada/checkers-inl.h"
 #include "ada/common_defs.h"
+#include "ada/ada_data_url.h"
 #include "ada/log.h"
 #include "ada/encoding_type.h"
 #include "ada/helpers.h"
diff --git a/include/ada/ada_data_url.h b/include/ada/ada_data_url.h
new file mode 100644
index 000000000..8f06b780e
--- /dev/null
+++ b/include/ada/ada_data_url.h
@@ -0,0 +1,33 @@
+#ifndef ADA_DATA_URL_H
+#define ADA_DATA_URL_H
+#include <string>
+#include <string_view>
+
+namespace ada::data_url {
+// https://fetch.spec.whatwg.org/#data-url-struct
+struct data_url {
+    bool is_valid = true;   // parse_data_url() sets this to false on failure
+    std::string body{};     // payload text filled in by parse_data_url()
+    std::string essence{};  // MIME type text filled in by parse_data_url()
+
+    data_url() = default;
+    ~data_url() = default;
+    data_url(const data_url &other) = default;
+    data_url(data_url &&other) noexcept = default;
+    data_url &operator=(const data_url &other) = default;
+    data_url &operator=(data_url &&other) noexcept = default;
+};
+
+ada::data_url::data_url parse_data_url(std::string_view data_url);
+
+std::string collect_sequence_of_code_points(char c, const std::string& input, size_t& position);
+
+bool isASCIIWhiteSpace(char c);
+
+std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing);
+
+static constexpr bool is_base64(std::string_view input);
+
+}
+
+#endif  // ADA_DATA_URL_H
diff --git a/include/ada/serializers.h b/include/ada/serializers.h
index d8e0d3d6f..b2e27fbb1 100644
--- a/include/ada/serializers.h
+++ b/include/ada/serializers.h
@@ -40,6 +40,8 @@ std::string ipv6(const std::array<uint16_t, 8>& address) noexcept;
  */
 std::string ipv4(uint64_t address) noexcept;
 
+std::string url_serializer(const ada::url& url, bool excludeFragment) noexcept;
+
 }  // namespace ada::serializers
 
 #endif  // ADA_SERIALIZERS_H
diff --git a/src/ada.cpp b/src/ada.cpp
index 164f37d74..99109e380 100644
--- a/src/ada.cpp
+++ b/src/ada.cpp
@@ -10,4 +10,5 @@
 #include "parser.cpp"
 #include "url_components.cpp"
 #include "url_aggregator.cpp"
-#include "ada_c.cpp"
\ No newline at end of file
+#include "ada_c.cpp"
+#include "ada_data_url.cpp"
\ No newline at end of file
diff --git a/src/ada_data_url.cpp b/src/ada_data_url.cpp
new file mode 100644
index 000000000..2a1fc1881
--- /dev/null
+++ b/src/ada_data_url.cpp
@@ -0,0 +1,132 @@
+#include <string_view>
+#include <cctype>
+
+#include "ada.h"
+
+namespace ada::data_url {
+
+ada::data_url::data_url parse_data_url(std::string_view data_url) {
+  // Fetch "data: URL processor" steps; any failure is reported via is_valid == false.
+  auto out = ada::data_url::data_url();
+  auto url = ada::parse<ada::url>(data_url, nullptr);
+
+  // 1. Assert: dataURL’s scheme is "data".
+  if (!url || url->get_protocol() != "data:") {
+      out.is_valid = false;
+      return out;
+  }
+
+  // 2. Let input be the result of running the URL serializer on dataURL with exclude
+  //    fragment set to true.
+  url->set_hash({});
+  auto input = url->get_href();
+
+  // 3. Remove the leading "data:" from input.
+  input.erase(0, 5);
+
+  // 4. Let position point at the start of input.
+  size_t position = 0;
+
+  // 5. Let mimeType be the result of collecting a sequence of code points that are
+  //    not equal to U+002C (,), given position.
+  auto mimetype = collect_sequence_of_code_points(',', input, position);
+  auto mimetype_length = mimetype.length();
+
+  // 6. Strip leading and trailing ASCII whitespace from mimeType.
+  mimetype = removeASCIIWhiteSpace(mimetype, true, true);
+
+  // 7. If position is past the end of input, then return failure.
+  if (position >= input.length()) {
+      out.is_valid = false;
+      return out;
+  }
+
+  // 8. Advance position by 1.
+  position++;
+
+  // 9. Let encodedBody be the remainder of input.
+  std::string encoded_body = input.substr(mimetype_length + 1);
+
+  // 10. Let body be the percent-decoding of encodedBody.
+  out.body = ada::unicode::percent_decode(encoded_body, encoded_body.find('%'));
+
+  // 11. If mimeType ends with U+003B (;), followed by zero or more U+0020 SPACE,
+  //     followed by an ASCII case-insensitive match for "base64", then:
+  if (is_base64(mimetype)) {
+    // 11.1. Let stringBody be the isomorphic decode of body.
+    // 11.2. Set body to the forgiving-base64 decode of stringBody.
+    // 11.3. If body is failure, then return failure.
+    // TODO: forgiving-base64 decoding is not implemented yet, so out.body keeps
+    //       the percent-decoded but still base64-encoded text.
+
+    // 11.4. Remove the last 6 code points from mimeType.
+    // 11.5. Remove trailing U+0020 SPACE code points from mimeType, if any.
+    // 11.6. Remove the last U+003B (;) from mimeType.
+    // The ';' is looked up in mimeType itself: an index taken from `input` may
+    // point into the body and make erase() throw std::out_of_range.
+    mimetype.erase(mimetype.find_last_of(';'));
+  }
+
+  // 12. If mimeType starts with ";", then prepend "text/plain" to mimeType.
+  if (!mimetype.empty() && mimetype.front() == ';') {
+      mimetype = "text/plain" + mimetype;
+  }
+
+  // 13-14. TODO: parse mimeType as a real MIME type. Until then the essence is
+  //        approximated by the text before the first ';', and "text/plain" is
+  //        used when that text has no '/' (e.g. an empty mimeType).
+  out.essence = mimetype.substr(0, mimetype.find(';'));
+  if (out.essence.find('/') == std::string::npos) {
+      out.essence = "text/plain";
+  }
+
+  // 15. Return the result; body and essence are filled in for every valid input.
+  return out;
+}
+
+std::string collect_sequence_of_code_points(char c, const std::string& input, size_t& position) {
+    auto idx = input.find_first_of(c, position);
+    size_t start = position;
+
+    if (idx == std::string::npos) {
+        position = reinterpret_cast<size_t>(input.length());
+        return input.substr(start);
+    }
+
+    position = reinterpret_cast<size_t>(idx);
+    return input.substr(start, position);
+}
+
+std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing) {
+    size_t lead = 0;
+    size_t trail = input.length();
+
+    if (leading) {
+        while (lead < input.length() && isASCIIWhiteSpace(input[lead]))
+            lead++;
+    }
+
+    if (trailing) {
+        while (trail > 0 && isASCIIWhiteSpace(input[trail]))
+            trail--;
+    }
+
+    return input.substr(lead, trail);
+}
+
+bool isASCIIWhiteSpace(char c) {
+    return c == '\r' || c == '\n' || c == '\t' || c == '\f';
+}
+
+static constexpr bool is_base64(std::string_view input) {
+    // True when input ends with ';', zero or more U+0020, then "base64" (ASCII case-insensitive).
+    const auto last_idx = input.find_last_of(';');
+    if (last_idx == std::string_view::npos) return false;
+    // Skip the optional spaces after the ';'; npos means nothing but spaces follows it.
+    const auto start = input.find_first_not_of(' ', last_idx + 1);
+    const auto res = start == std::string_view::npos ? std::string_view{} : input.substr(start);
+    return res.size() == 6 && (res[0] | 0x20) == 'b' && (res[1] | 0x20) == 'a' &&
+           (res[2] | 0x20) == 's' && (res[3] | 0x20) == 'e' && res[4] == '6' && res[5] == '4';
+}
+
+}
diff --git a/src/serializers.cpp b/src/serializers.cpp
index 91be39ce1..cc9a9974b 100644
--- a/src/serializers.cpp
+++ b/src/serializers.cpp
@@ -77,4 +77,22 @@ std::string ipv4(const uint64_t address) noexcept {
   return output;
 }
 
+std::string url_serializer(const ada::url& url, bool excludeFragment) noexcept {
+    std::string serialized = url.get_href();
+    if (!excludeFragment) {
+        return serialized;
+    }
+
+    // Number of trailing characters that get_hash() reports for the fragment.
+    const size_t fragment_size = url.has_hash() ? url.get_hash().size() : 0;
+    if (fragment_size != 0) {
+        return serialized.substr(0, serialized.length() - fragment_size);
+    }
+    // No fragment text reported: drop a bare trailing '#', if there is one.
+    if (!serialized.empty() && serialized.back() == '#') {
+        serialized.pop_back();
+    }
+    return serialized;
+}
+
 }  // namespace ada::serializers
diff --git a/tests/basic_tests.cpp b/tests/basic_tests.cpp
index d1d452a42..be35463e1 100644
--- a/tests/basic_tests.cpp
+++ b/tests/basic_tests.cpp
@@ -462,4 +462,11 @@ TYPED_TEST(basic_tests, negativeport) {
   auto url = ada::parse<TypeParam>("https://www.google.com");
   ASSERT_FALSE(url->set_port("-1"));
   SUCCEED();
+}
+
+TYPED_TEST(basic_tests, data_url) {
+    auto data_url = ada::data_url::parse_data_url("data:application/octet-stream;base64,YWJj");
+    ASSERT_TRUE(data_url.is_valid);
+    ASSERT_EQ(data_url.essence, "application/octet-stream");
+    ASSERT_EQ(data_url.body, "YWJj");
 }
\ No newline at end of file

From de37309fd2ab364ffbbb6e708bbb07b759ad04b2 Mon Sep 17 00:00:00 2001
From: Khafra <maitken033380023@gmail.com>
Date: Thu, 24 Oct 2024 18:58:48 -0400
Subject: [PATCH 2/2] fixup

---
 include/ada/ada_data_url.h |  4 ++--
 src/ada_data_url.cpp       | 28 +++++++++++++++++++---------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/ada/ada_data_url.h b/include/ada/ada_data_url.h
index 8f06b780e..e9fc456f5 100644
--- a/include/ada/ada_data_url.h
+++ b/include/ada/ada_data_url.h
@@ -22,9 +22,9 @@ ada::data_url::data_url parse_data_url(std::string_view data_url);
 
 std::string collect_sequence_of_code_points(char c, const std::string& input, size_t& position);
 
-bool isASCIIWhiteSpace(char c);
+bool is_ascii_whitespace(char c);
 
-std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing);
+std::string remove_ascii_whitespace(std::string input, bool leading, bool trailing);
 
 static constexpr bool is_base64(std::string_view input);
 
diff --git a/src/ada_data_url.cpp b/src/ada_data_url.cpp
index 2a1fc1881..a5c831c6f 100644
--- a/src/ada_data_url.cpp
+++ b/src/ada_data_url.cpp
@@ -33,7 +33,7 @@ ada::data_url::data_url parse_data_url(std::string_view data_url) {
   auto mimetype_length = mimetype.length();
 
   // 6. Strip leading and trailing ASCII whitespace from mimeType.
-  mimetype = removeASCIIWhiteSpace(mimetype, true, true);
+  mimetype = remove_ascii_whitespace(mimetype, true, true);
 
   // 7. If position is past the end of input, then return failure.
   if (position >= input.length()) {
@@ -89,33 +89,43 @@ std::string collect_sequence_of_code_points(char c, const std::string& input, si
     size_t start = position;
 
     if (idx == std::string::npos) {
-        position = reinterpret_cast<size_t>(input.length());
+        position = input.length();
         return input.substr(start);
     }
 
-    position = reinterpret_cast<size_t>(idx);
-    return input.substr(start, position);
+    position = idx;
+    return input.substr(start, position - start);  // substr() takes a length, not an end index
 }
 
-std::string removeASCIIWhiteSpace(const std::string& input, bool leading, bool trailing) {
+std::string remove_ascii_whitespace(std::string input, bool leading, bool trailing) {
     size_t lead = 0;
     size_t trail = input.length();
 
     if (leading) {
-        while (lead < input.length() && isASCIIWhiteSpace(input[lead]))
+        while (lead < input.length() && is_ascii_whitespace(input[lead])) {
             lead++;
+        }
+        // Drop the whitespace prefix (erase(lead) would drop the content instead)
+        // and refresh trail, which was measured before the string shrank.
+        input.erase(0, lead);
+        trail = input.length();
     }
 
     if (trailing) {
-        while (trail > 0 && isASCIIWhiteSpace(input[trail]))
+        while (trail > 0 && is_ascii_whitespace(input[trail - 1])) {
             trail--;
+        }
+        // trail is now the number of characters to keep.
+        if (trail != input.length()) {
+            input.resize(trail);
+        }
     }
 
-    return input.substr(lead, trail);
+    return input;
 }
 
-bool isASCIIWhiteSpace(char c) {
-    return c == '\r' || c == '\n' || c == '\t' || c == '\f';
+bool is_ascii_whitespace(char c) {
+    return c == '\t' || c == '\n' || c == '\f' || c == '\r' || c == ' ';
 }
 
 static constexpr bool is_base64(std::string_view input) {