From 21adb013aa1819f224deb8111a5065c96b58d005 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 27 Jan 2023 15:49:13 -0500 Subject: [PATCH] deps: replace url parser with Ada --- LICENSE | 22 + Makefile | 2 +- deps/ada/ada.cpp | 2485 +++++++++++++++++ deps/ada/ada.gyp | 29 + deps/ada/ada.h | 1846 ++++++++++++ lib/internal/url.js | 459 +-- node.gyp | 36 +- src/crypto/crypto_common.cc | 7 +- src/inspector_agent.cc | 4 +- src/module_wrap.cc | 3 - src/node_api.cc | 2 +- src/node_url.cc | 2029 ++------------ src/node_url.h | 194 +- src/node_url_tables.cc | 448 --- test/cctest/test_url.cc | 218 -- test/fuzzers/fuzz_url.cc | 11 - .../test-whatwg-url-custom-inspect.js | 18 +- tools/license-builder.sh | 2 + 18 files changed, 4691 insertions(+), 3124 deletions(-) create mode 100644 deps/ada/ada.cpp create mode 100644 deps/ada/ada.gyp create mode 100644 deps/ada/ada.h delete mode 100644 src/node_url_tables.cc delete mode 100644 test/cctest/test_url.cc delete mode 100644 test/fuzzers/fuzz_url.cc diff --git a/LICENSE b/LICENSE index f8fa687202dcb9..93ece9530ee905 100644 --- a/LICENSE +++ b/LICENSE @@ -1338,6 +1338,28 @@ The externally maintained libraries used by Node.js are: CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +- ada, located at deps/ada, is licensed as follows: + """ + Copyright 2022 Ada authors + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + """ + - npm, located at deps/npm, is licensed as follows: """ The npm application diff --git a/Makefile b/Makefile index 94013466239e9c..7648063958a199 100644 --- a/Makefile +++ b/Makefile @@ -170,7 +170,7 @@ with-code-cache test-code-cache: out/Makefile: config.gypi common.gypi node.gyp \ deps/uv/uv.gyp deps/llhttp/llhttp.gyp deps/zlib/zlib.gyp \ - deps/simdutf/simdutf.gyp \ + deps/simdutf/simdutf.gyp deps/ada/ada.gyp \ tools/v8_gypfiles/toolchain.gypi tools/v8_gypfiles/features.gypi \ tools/v8_gypfiles/inspector.gypi tools/v8_gypfiles/v8.gyp $(PYTHON) tools/gyp_node.py -f make diff --git a/deps/ada/ada.cpp b/deps/ada/ada.cpp new file mode 100644 index 00000000000000..70b01ebabe880d --- /dev/null +++ b/deps/ada/ada.cpp @@ -0,0 +1,2485 @@ +/* auto-generated on 2023-01-30 11:28:20 -0500. Do not edit! */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=ada.cpp +/* begin file src/ada.cpp */ +#include "ada.h" +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=checkers.cpp +/* begin file src/checkers.cpp */ +#include + +namespace ada::checkers { + + ada_really_inline ada_constexpr bool is_ipv4(std::string_view view) noexcept { + size_t last_dot = view.rfind('.'); + if(last_dot == view.size() - 1) { + view.remove_suffix(1); + last_dot = view.rfind('.'); + } + std::string_view number = (last_dot == std::string_view::npos) ? 
view : view.substr(last_dot+1); + if(number.empty()) { return false; } + /** Optimization opportunity: we have basically identified the last number of the + ipv4 if we return true here. We might as well parse it and have at least one + number parsed when we get to parse_ipv4. */ + if(std::all_of(number.begin(), number.end(), ada::checkers::is_digit)) { return true; } + return (checkers::has_hex_prefix(number) && std::all_of(number.begin()+2, number.end(), ada::unicode::is_lowercase_hex)); + } + + + // for use with path_signature + static constexpr uint8_t path_signature_table[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + + ada_really_inline constexpr uint8_t path_signature(std::string_view input) noexcept { + size_t i = 0; + uint8_t accumulator{}; + for (; i + 7 < input.size(); i += 8) { + accumulator |= uint8_t(path_signature_table[uint8_t(input[i])] | + path_signature_table[uint8_t(input[i + 1])] | + path_signature_table[uint8_t(input[i + 2])] | + path_signature_table[uint8_t(input[i + 3])] | + path_signature_table[uint8_t(input[i + 4])] | + path_signature_table[uint8_t(input[i + 5])] | + path_signature_table[uint8_t(input[i + 6])] | + path_signature_table[uint8_t(input[i + 7])]); + } + for (; i < input.size(); i++) { + 
accumulator |= path_signature_table[uint8_t(input[i])]; + } + return accumulator; + } + + + ada_really_inline constexpr bool verify_dns_length(std::string_view input) noexcept { + if(input.back() == '.') { + if(input.size() > 254) return false; + } else if (input.size() > 253) return false; + + size_t start = 0; + while (start < input.size()) { + auto dot_location = input.find('.', start); + // If not found, it's likely the end of the domain + if(dot_location == std::string_view::npos) dot_location = input.size(); + + auto label_size = dot_location - start; + if (label_size > 63 || label_size == 0) return false; + + start = dot_location + 1; + } + + return true; + } +} // namespace ada::checkers +/* end file src/checkers.cpp */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=unicode.cpp +/* begin file src/unicode.cpp */ + +#include +#ifdef _WIN32 +#ifndef __wtypes_h__ +#include +#endif + +#ifndef __WINDEF_ +#include +#endif +#include +#else +#include +#include +#include +#endif +namespace ada::unicode { + + ada_really_inline constexpr bool has_tabs_or_newline(std::string_view user_input) noexcept { + auto has_zero_byte = [](uint64_t v) { + return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080); + }; + auto broadcast = [](uint8_t v) -> uint64_t { return 0x101010101010101 * v; }; + size_t i = 0; + uint64_t mask1 = broadcast('\r'); + uint64_t mask2 = broadcast('\n'); + uint64_t mask3 = broadcast('\t'); + uint64_t running{0}; + for (; i + 7 < user_input.size(); i += 8) { + uint64_t word{}; + memcpy(&word, user_input.data() + i, sizeof(word)); + uint64_t xor1 = word ^ mask1; + uint64_t xor2 = word ^ mask2; + uint64_t xor3 = word ^ mask3; + running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3); + } + if (i < user_input.size()) { + uint64_t word{}; + memcpy(&word, user_input.data() + i, user_input.size() - i); + uint64_t xor1 = word ^ mask1; + uint64_t xor2 = word ^ mask2; + uint64_t xor3 = word ^ mask3; + running 
|= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3); + } + return running; + } + + // A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR, U+0020 SPACE, U+0023 (#), + // U+002F (/), U+003A (:), U+003C (<), U+003E (>), U+003F (?), U+0040 (@), U+005B ([), U+005C (\), U+005D (]), + // U+005E (^), or U+007C (|). + constexpr static bool is_forbidden_host_code_point_table[] = { + 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + static_assert(sizeof(is_forbidden_host_code_point_table) == 256); + + ada_really_inline constexpr bool is_forbidden_host_code_point(const char c) noexcept { + return is_forbidden_host_code_point_table[uint8_t(c)]; + } + + static_assert(unicode::is_forbidden_host_code_point('\0')); + static_assert(unicode::is_forbidden_host_code_point('\t')); + static_assert(unicode::is_forbidden_host_code_point('\n')); + static_assert(unicode::is_forbidden_host_code_point('\r')); + static_assert(unicode::is_forbidden_host_code_point(' ')); + static_assert(unicode::is_forbidden_host_code_point('#')); + static_assert(unicode::is_forbidden_host_code_point('/')); + static_assert(unicode::is_forbidden_host_code_point(':')); + static_assert(unicode::is_forbidden_host_code_point('?')); + 
static_assert(unicode::is_forbidden_host_code_point('@')); + static_assert(unicode::is_forbidden_host_code_point('[')); + static_assert(unicode::is_forbidden_host_code_point('?')); + static_assert(unicode::is_forbidden_host_code_point('<')); + static_assert(unicode::is_forbidden_host_code_point('>')); + static_assert(unicode::is_forbidden_host_code_point('\\')); + static_assert(unicode::is_forbidden_host_code_point(']')); + static_assert(unicode::is_forbidden_host_code_point('^')); + static_assert(unicode::is_forbidden_host_code_point('|')); + +constexpr static bool is_forbidden_domain_code_point_table[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + + static_assert(sizeof(is_forbidden_domain_code_point_table) == 256); + + ada_really_inline constexpr bool is_forbidden_domain_code_point(const char c) noexcept { + // abort(); + return is_forbidden_domain_code_point_table[uint8_t(c)]; + // A table is almost surely much faster than the + // following under most compilers: return + // is_forbidden_host_code_point(c) | + // std::iscntrl(c) | c == '%' | c == '\x7f'; + } + + static_assert(unicode::is_forbidden_domain_code_point('%')); + static_assert(unicode::is_forbidden_domain_code_point('\x7f')); + 
static_assert(unicode::is_forbidden_domain_code_point('\0')); + static_assert(unicode::is_forbidden_domain_code_point('\t')); + static_assert(unicode::is_forbidden_domain_code_point('\n')); + static_assert(unicode::is_forbidden_domain_code_point('\r')); + static_assert(unicode::is_forbidden_domain_code_point(' ')); + static_assert(unicode::is_forbidden_domain_code_point('#')); + static_assert(unicode::is_forbidden_domain_code_point('/')); + static_assert(unicode::is_forbidden_domain_code_point(':')); + static_assert(unicode::is_forbidden_domain_code_point('?')); + static_assert(unicode::is_forbidden_domain_code_point('@')); + static_assert(unicode::is_forbidden_domain_code_point('[')); + static_assert(unicode::is_forbidden_domain_code_point('?')); + static_assert(unicode::is_forbidden_domain_code_point('<')); + static_assert(unicode::is_forbidden_domain_code_point('>')); + static_assert(unicode::is_forbidden_domain_code_point('\\')); + static_assert(unicode::is_forbidden_domain_code_point(']')); + static_assert(unicode::is_forbidden_domain_code_point('^')); + static_assert(unicode::is_forbidden_domain_code_point('|')); + + constexpr static bool is_alnum_plus_table[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + 
static_assert(sizeof(is_alnum_plus_table) == 256); + + ada_really_inline constexpr bool is_alnum_plus(const char c) noexcept { + return is_alnum_plus_table[uint8_t(c)]; + // A table is almost surely much faster than the + // following under most compilers: return + // return (std::isalnum(c) || c == '+' || c == '-' || c == '.'); + } + static_assert(unicode::is_alnum_plus('+')); + static_assert(unicode::is_alnum_plus('-')); + static_assert(unicode::is_alnum_plus('.')); + static_assert(unicode::is_alnum_plus('0')); + static_assert(unicode::is_alnum_plus('1')); + static_assert(unicode::is_alnum_plus('a')); + static_assert(unicode::is_alnum_plus('b')); + + ada_really_inline constexpr bool is_ascii_hex_digit(const char c) noexcept { + return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c<= 'f'); + } + + ada_really_inline constexpr bool is_c0_control_or_space(const char c) noexcept { + return (unsigned char) c <= ' '; + } + + ada_really_inline constexpr bool is_ascii_tab_or_newline(const char c) noexcept { + return c == '\t' || c == '\n' || c == '\r'; + } + + constexpr std::string_view table_is_double_dot_path_segment[] = {"..", "%2e.", ".%2e", "%2e%2e"}; + + ada_really_inline ada_constexpr bool is_double_dot_path_segment(std::string_view input) noexcept { + // This will catch most cases: + // The length must be 2,4 or 6. + // We divide by two and require + // that the result be between 1 and 3 inclusively. + uint64_t half_length = uint64_t(input.size())/2; + if(half_length - 1 > 2) { return false; } + // We have a string of length 2, 4 or 6. + // We now check the first character: + if((input[0] != '.') && (input[0] != '%')) { return false; } + // We are unlikely the get beyond this point. + int hash_value = (input.size() + (unsigned)(input[0])) & 3; + const std::string_view target = table_is_double_dot_path_segment[hash_value]; + if(target.size() != input.size()) { return false; } + // We almost never get here. 
+ // Optimizing the rest is relatively unimportant. + auto prefix_equal_unsafe = [](std::string_view a, std::string_view b) { + uint16_t A, B; + memcpy(&A,a.data(), sizeof(A)); + memcpy(&B,b.data(), sizeof(B)); + return A == B; + }; + if(!prefix_equal_unsafe(input,target)) { return false; } + for(size_t i = 2; i < input.size(); i++) { + char c = input[i]; + if((uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c) != target[i]) { return false; } + } + return true; + // The above code might be a bit better than the code below. Compilers + // are not stupid and may use the fact that these strings have length 2,4 and 6 + // and other tricks. + //return input == ".." || + // input == ".%2e" || input == ".%2E" || + // input == "%2e." || input == "%2E." || + // input == "%2e%2e" || input == "%2E%2E" || input == "%2E%2e" || input == "%2e%2E"; + } + + ada_really_inline constexpr bool is_single_dot_path_segment(std::string_view input) noexcept { + return input == "." || input == "%2e" || input == "%2E"; + } + + ada_really_inline constexpr bool is_lowercase_hex(const char c) noexcept { + return (c >= '0' && c <= '9') || (c >= 'a' && c<= 'f'); + } + + unsigned constexpr convert_hex_to_binary(const char c) noexcept { + // this code can be optimized. + if (c <= '9') { return c - '0'; } + char del = c >= 'a' ? 'a' : 'A'; + return 10 + (c - del); + } + + std::string percent_decode(const std::string_view input, size_t first_percent) { + // next line is for safety only, we expect users to avoid calling percent_decode + // when first_percent is outside the range. + if(first_percent == std::string_view::npos) { return std::string(input); } + std::string dest(input.substr(0, first_percent)); + dest.reserve(input.length()); + const char* pointer = input.data() + first_percent; + const char* end = input.data() + input.size(); + // Optimization opportunity: if the following code gets + // called often, it can be optimized quite a bit. 
+ while (pointer < end) { + const char ch = pointer[0]; + size_t remaining = end - pointer - 1; + if (ch != '%' || remaining < 2 || + (//ch == '%' && // It is unnecessary to check that ch == '%'. + (!is_ascii_hex_digit(pointer[1]) || + !is_ascii_hex_digit(pointer[2])))) { + dest += ch; + pointer++; + continue; + } else { + unsigned a = convert_hex_to_binary(pointer[1]); + unsigned b = convert_hex_to_binary(pointer[2]); + char c = static_cast(a * 16 + b); + dest += c; + pointer += 3; + } + } + return dest; + } + + std::string percent_encode(const std::string_view input, const uint8_t character_set[]) { + auto pointer = std::find_if(input.begin(), input.end(), [character_set](const char c) { + return character_sets::bit_at(character_set, c); + }); + // Optimization: Don't iterate if percent encode is not required + if (pointer == input.end()) { return std::string(input); } + + std::string result(input.substr(0,std::distance(input.begin(), pointer))); + result.reserve(input.length()); // in the worst case, percent encoding might produce 3 characters. 
+ + for (;pointer != input.end(); pointer++) { + if (character_sets::bit_at(character_set, *pointer)) { + result.append(character_sets::hex + uint8_t(*pointer) * 4, 3); + } else { + result += *pointer; + } + } + + return result; + } + + + bool percent_encode(const std::string_view input, const uint8_t character_set[], std::string &out) { + auto pointer = std::find_if(input.begin(), input.end(), [character_set](const char c) { + return character_sets::bit_at(character_set, c); + }); + // Optimization: Don't iterate if percent encode is not required + if (pointer == input.end()) { return false; } + out.clear(); + out.append(input.data(), std::distance(input.begin(), pointer)); + + for (;pointer != input.end(); pointer++) { + if (character_sets::bit_at(character_set, *pointer)) { + out.append(character_sets::hex + uint8_t(*pointer) * 4, 3); + } else { + out += *pointer; + } + } + return true; + } + + bool to_ascii(std::optional& out, const std::string_view plain, const bool be_strict, size_t first_percent) { + std::string percent_decoded_buffer; + std::string_view input = plain; + if(first_percent != std::string_view::npos) { + percent_decoded_buffer = unicode::percent_decode(plain, first_percent); + input = percent_decoded_buffer; + } +#ifdef _WIN32 + // Windows function assumes UTF-16. 
+ std::unique_ptr buffer(new char16_t[input.size()]); + auto convert = [](const char* buf, size_t len, char16_t* utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while(pos < final_pos) { + *utf16_output++ = char16_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 1 >= len) { return 0; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + // range check + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { return 0; } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. 
+ if(pos + 2 >= len) { return 0; } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return 0; + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { return 0; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } + + // range check + uint32_t code_point = + (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + return 0; + } + } + return int(utf16_output - start); + }; + size_t codepoints = convert(input.data(), input.size(), buffer.get()); + if(codepoints == 0) { + out = std::nullopt; + return false; + } + int required_buffer_size = IdnToAscii(IDN_ALLOW_UNASSIGNED, (LPCWSTR)buffer.get(), codepoints, NULL, 0); + + if(required_buffer_size == 0) { + out = std::nullopt; + return false; + } + + out = std::string(required_buffer_size, 0); + std::unique_ptr ascii_buffer(new char16_t[required_buffer_size]); + + required_buffer_size = IdnToAscii(IDN_ALLOW_UNASSIGNED, (LPCWSTR)buffer.get(), codepoints, 
(LPWSTR)ascii_buffer.get(), required_buffer_size); + if(required_buffer_size == 0) { + out = std::nullopt; + return false; + } + // This will not validate the punycode, so let us work it in reverse. + int test_reverse = IdnToUnicode(IDN_ALLOW_UNASSIGNED, (LPCWSTR)ascii_buffer.get(), required_buffer_size, NULL, 0); + if(test_reverse == 0) { + out = std::nullopt; + return false; + } + out = std::string(required_buffer_size, 0); + for(size_t i = 0; i < required_buffer_size; i++) { (*out)[i] = char(ascii_buffer.get()[i]); } +#else + out = std::string(255, 0); + + UErrorCode status = U_ZERO_ERROR; + uint32_t options = UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_ASCII; + + if (be_strict) { + options |= UIDNA_USE_STD3_RULES; + } + + UIDNA* uidna = uidna_openUTS46(options, &status); + if (U_FAILURE(status)) { + return false; + } + + UIDNAInfo info = UIDNA_INFO_INITIALIZER; + // RFC 1035 section 2.3.4. + // The domain name must be at most 255 octets. + // It cannot contain a label longer than 63 octets. + // Thus we should never need more than 255 octets, if we + // do the domain name is in error. + int32_t length = uidna_nameToASCII_UTF8(uidna, + input.data(), + int32_t(input.length()), + out.value().data(), 255, + &info, + &status); + + if (status == U_BUFFER_OVERFLOW_ERROR) { + status = U_ZERO_ERROR; + out.value().resize(length); + // When be_strict is true, this should not be allowed! + length = uidna_nameToASCII_UTF8(uidna, + input.data(), + int32_t(input.length()), + out.value().data(), length, + &info, + &status); + } + + // A label contains hyphen-minus ('-') in the third and fourth positions. + info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; + // A label starts with a hyphen-minus ('-'). + info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; + // A label ends with a hyphen-minus ('-'). + info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; + + if (!be_strict) { // This seems to violate RFC 1035 section 2.3.4. 
+ // A non-final domain name label (or the whole domain name) is empty. + info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; + // A domain name label is longer than 63 bytes. + info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; + // A domain name is longer than 255 bytes in its storage form. + info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; + } + + uidna_close(uidna); + + if (U_FAILURE(status) || info.errors != 0 || length == 0) { + out = std::nullopt; + return false; + } + out.value().resize(length); // we possibly want to call :shrink_to_fit otherwise we use 255 bytes. + out.value().shrink_to_fit(); +#endif + if(std::any_of(out.value().begin(), out.value().end(), ada::unicode::is_forbidden_domain_code_point)) { + out = std::nullopt; + return false; + } + return true; + } + +} // namespace ada::unicode +/* end file src/unicode.cpp */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=serializers.cpp +/* begin file src/serializers.cpp */ +#include +#include + +namespace ada::serializers { + + size_t find_longest_sequence_of_ipv6_pieces(const std::array& address) noexcept { + size_t max_index = -1; + size_t max_length = 1; + size_t current_start = -1; + size_t current_length = 0; + + for (size_t i = 0; i < 8; i++) { + if (address[i] != 0) { + if (current_length > max_length) { + max_index = current_start; + max_length = current_length; + } + + current_start = -1; + current_length = 0; + } else { + if (current_start == size_t(-1)) { + current_start = i; + } + current_length++; + } + } + + if (current_length > max_length) { + return current_start; + } + + return max_index; + } + + std::string ipv6(const std::array& address) noexcept { + // Let output be the empty string. + std::string output{}; + + // Let compress be an index to the first IPv6 piece in the first longest sequences of address’s IPv6 pieces that are 0. + size_t compress = find_longest_sequence_of_ipv6_pieces(address); + + // Let ignore0 be false. 
+ bool ignore_0{false}; + + // For each pieceIndex in the range 0 to 7, inclusive: + for (size_t piece_index = 0; piece_index < 8; piece_index++) { + // If ignore0 is true and address[pieceIndex] is 0, then continue. + if (ignore_0 && address[piece_index] == 0) { + continue; + } + // Otherwise, if ignore0 is true, set ignore0 to false. + else if (ignore_0) { + ignore_0 = false; + } + + // If compress is pieceIndex, then: + if (compress == piece_index) { + // Let separator be "::" if pieceIndex is 0, and U+003A (:) otherwise. + // Append separator to output. + output += (piece_index == 0) ? "::" : ":"; + + // Set ignore0 to true and continue. + ignore_0 = true; + continue; + } + + // Append address[pieceIndex], represented as the shortest possible lowercase hexadecimal number, to output. + char buf[5]; + snprintf(buf, sizeof(buf), "%x", address[piece_index]); + output += buf; + + // If pieceIndex is not 7, then append U+003A (:) to output. + if (piece_index < 7) { + output += ':'; + } + } + + return "[" + output + "]"; + } + + std::string ipv4(const uint64_t address) noexcept { + // Let output be the empty string. + std::string output{}; + + // Let n be the value of address. + auto n = address; + + // For each i in the range 1 to 4, inclusive: + for (size_t i = 1; i <= 4; i++) { + // Prepend n % 256, serialized, to output. + output.insert(0, std::to_string(n % 256)); + + // If i is not 4, then prepend U+002E (.) to output. + if (i != 4) { + output.insert(0, "."); + } + + // Set n to floor(n / 256). + n >>= 8; + } + + // Return output. 
+ return output; + } + +} // namespace ada::serializers +/* end file src/serializers.cpp */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=implementation.cpp +/* begin file src/implementation.cpp */ +#include + + +namespace ada { + + ada_warn_unused url parse(std::string_view input, + const ada::url* base_url, + ada::encoding_type encoding) { + if(encoding != encoding_type::UTF8) { + // @todo Add support for non UTF8 input + } + return ada::parser::parse_url(input, base_url, encoding); + } + + ada_warn_unused std::string to_string(ada::encoding_type type) { + switch(type) { + case ada::encoding_type::UTF8 : return "UTF-8"; + case ada::encoding_type::UTF_16LE : return "UTF-16LE"; + case ada::encoding_type::UTF_16BE : return "UTF-16BE"; + default: unreachable(); + } + } + +} // namespace ada +/* end file src/implementation.cpp */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=helpers.cpp +/* begin file src/helpers.cpp */ + +#include +#include +#include +#include + +namespace ada::helpers { + + template + void encode_json(std::string_view view, out_iter out) { + // trivial implementation. could be faster. 
+ const char * hexvalues = "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f"; + for(uint8_t c : view) { + if(c == '\\') { + *out++ = '\\'; *out++ = '\\'; + } else if(c == '"') { + *out++ = '\\'; *out++ = '"'; + } else if(c <= 0x1f) { + *out++ = '\\'; *out++= 'u'; *out++= '0'; *out++= '0'; + *out++ = hexvalues[2*c]; + *out++ = hexvalues[2*c+1]; + } else { + *out++ = c; + } + } + } + + ada_unused std::string get_state(ada::state s) { + switch (s) { + case ada::state::AUTHORITY: return "Authority"; + case ada::state::SCHEME_START: return "Scheme Start"; + case ada::state::SCHEME: return "Scheme"; + case ada::state::HOST: return "Host"; + case ada::state::NO_SCHEME: return "No Scheme"; + case ada::state::FRAGMENT: return "Fragment"; + case ada::state::RELATIVE_SCHEME: return "Relative Scheme"; + case ada::state::RELATIVE_SLASH: return "Relative Slash"; + case ada::state::FILE: return "File"; + case ada::state::FILE_HOST: return "File Host"; + case ada::state::FILE_SLASH: return "File Slash"; + case ada::state::PATH_OR_AUTHORITY: return "Path or Authority"; + case ada::state::SPECIAL_AUTHORITY_IGNORE_SLASHES: return "Special Authority Ignore Slashes"; + case ada::state::SPECIAL_AUTHORITY_SLASHES: return "Special Authority Slashes"; + case ada::state::SPECIAL_RELATIVE_OR_AUTHORITY: return "Special Relative or Authority"; + case ada::state::QUERY: return "Query"; + case ada::state::PATH: return "Path"; + case ada::state::PATH_START: return "Path Start"; + case ada::state::OPAQUE_PATH: return "Opaque Path"; + case ada::state::PORT: return "Port"; + default: return "unknown state"; + } + } + + ada_really_inline std::optional prune_fragment(std::string_view& input) noexcept { + // compiles down to 20--30 instructions including a class to memchr (C function). + // this function should be quite fast. 
+ size_t location_of_first = input.find('#'); + if(location_of_first == std::string_view::npos) { return std::nullopt; } + std::string_view fragment = input; + fragment.remove_prefix(location_of_first+1); + input.remove_suffix(input.size() - location_of_first); + return fragment; + } + + ada_really_inline void shorten_path(ada::url &url) noexcept { + size_t first_delimiter = url.path.find_first_of('/', 1); + + // Let path be url’s path. + // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return. + if (url.get_scheme_type() == ada::scheme::type::FILE && first_delimiter == std::string_view::npos) { + if (checkers::is_normalized_windows_drive_letter(std::string_view(url.path.data() + 1, first_delimiter - 1))) { + return; + } + } + + // Remove path’s last item, if any. + if (!url.path.empty()) { + url.path.erase(url.path.rfind('/')); + } + } + + ada_really_inline void remove_ascii_tab_or_newline(std::string& input) noexcept { + // if this ever becomes a performance issue, we could use an approach similar to has_tabs_or_newline + input.erase(std::remove_if(input.begin(), input.end(), [](char c) { + return ada::unicode::is_ascii_tab_or_newline(c); + }), input.end()); + } + + ada_really_inline std::string_view substring(std::string_view input, size_t pos) noexcept { + ada_log("substring(", input, " [", input.size() ,"bytes],", pos, ")"); + return pos > input.size() ? std::string_view() : input.substr(pos); + } + + ada_really_inline size_t get_host_delimiter_location(const ada::url& url, std::string_view& view, bool& inside_brackets) noexcept { + size_t location = url.is_special() ? view.find_first_of(":[/?\\") : view.find_first_of(":[/?"); + + // Next while loop is almost never taken! + while((location != std::string_view::npos) && (view[location] == '[')) { + location = view.find(']',location); + if(location == std::string_view::npos) { + inside_brackets = true; + /** + * TODO: Ok. 
So if we arrive here then view has an unclosed [, + * Is the URL valid??? + */ + } else { + location = url.is_special() ? view.find_first_of(":[/?\\#", location) : view.find_first_of(":[/?#", location); + } + } + + if (location != std::string_view::npos) { + view.remove_suffix(view.size() - location); + } + return location; + } + + ada_really_inline void trim_c0_whitespace(std::string_view& input) noexcept { + while(!input.empty() && ada::unicode::is_c0_control_or_space(input.front())) { input.remove_prefix(1); } + while(!input.empty() && ada::unicode::is_c0_control_or_space(input.back())) { input.remove_suffix(1); } + } + +} // namespace ada::helpers + +namespace ada { + ada_warn_unused std::string to_string(ada::state state) { + return ada::helpers::get_state(state); + } +} +/* end file src/helpers.cpp */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=url.cpp +/* begin file src/url.cpp */ + +#include +#include +#include + +namespace ada { + ada_really_inline bool url::parse_path(std::string_view input) { + ada_log("parse_path ", input); + std::string tmp_buffer; + std::string_view internal_input; + if(unicode::has_tabs_or_newline(input)) { + tmp_buffer = input; + // Optimization opportunity: Instead of copying and then pruning, we could just directly + // build the string from user_input. 
+ helpers::remove_ascii_tab_or_newline(tmp_buffer); + internal_input = tmp_buffer; + } else { + internal_input = input; + } + + // If url is special, then: + if (is_special()) { + if(internal_input.empty()) { + path = "/"; + } else if((internal_input[0] == '/') ||(internal_input[0] == '\\')){ + return parse_prepared_path(internal_input.substr(1)); + } else { + return parse_prepared_path(internal_input); + } + } else if (!internal_input.empty()) { + if(internal_input[0] == '/') { + return parse_prepared_path(internal_input.substr(1)); + } else { + return parse_prepared_path(internal_input); + } + } else { + if(!host.has_value()) { + path = "/"; + } + } + return true; + } + + + ada_really_inline bool url::parse_prepared_path(std::string_view input) { + ada_log("parse_path ", input, " url: ", to_string()); + uint8_t accumulator = checkers::path_signature(input); + // Let us first detect a trivial case. + // If it is special, we check that we have no dot, no %, no \ and no + // character needing percent encoding. Otherwise, we check that we have no %, + // no dot, and no character needing percent encoding. + bool trivial_path = + (is_special() ? (accumulator == 0) : ((accumulator & 0b11111101) == 0)) && + (get_scheme_type() != ada::scheme::type::FILE); + if (trivial_path) { + ada_log("parse_path trivial"); + path += '/'; + path += input; + return true; + } + // We are going to need to look a bit at the path, but let us see if we can + // ignore percent encoding *and* \ characters. + bool fast_path = (is_special() && (accumulator & 0b11111011) == 0) && + (get_scheme_type() != ada::scheme::type::FILE); + if (fast_path) { + ada_log("parse_path fast"); + do { + // Here we don't need to worry about \\ or percent encoding. 
+ size_t location = input.find('/'); + std::string_view path_view = input; + if (location != std::string_view::npos) { + path_view.remove_suffix(path_view.size() - location); + input.remove_prefix(location + 1); + } + if (unicode::is_double_dot_path_segment(path_view)) { + helpers::shorten_path(*this); + if (location == std::string_view::npos) { + path += '/'; + } + } else if (unicode::is_single_dot_path_segment(path_view) && + (location == std::string_view::npos)) { + path += '/'; + } + // Otherwise, if path_view is not a single-dot path segment, then: + else if (!unicode::is_single_dot_path_segment(path_view)) { + // If url’s scheme is "file", url’s path is empty, and path_view is a + // Windows drive letter, then replace the second code point in + // path_view with U+003A (:). + + // Append path_buffer to url’s path. + path += '/'; + path.append(path_view); + } + if (location == std::string_view::npos) { + return true; + } + + } while (true); + } else { + ada_log("parse_path slow"); + // we have reached the general case + bool needs_percent_encoding = (accumulator & 1); + std::string path_buffer_tmp; + do { + size_t location = (is_special() && (accumulator & 2)) + ? input.find_first_of("/\\") + : input.find('/'); + std::string_view path_view = input; + if (location != std::string_view::npos) { + path_view.remove_suffix(path_view.size() - location); + input.remove_prefix(location + 1); + } + // path_buffer is either path_view or it might point at a percent encoded temporary file. + std::string_view path_buffer = + (needs_percent_encoding + && ada::unicode::percent_encode(path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp)) ? 
+ path_buffer_tmp : + path_view; + if (unicode::is_double_dot_path_segment(path_buffer)) { + helpers::shorten_path(*this); + if (location == std::string_view::npos) { + path += '/'; + } + } else if (unicode::is_single_dot_path_segment(path_buffer) && + (location == std::string_view::npos)) { + path += '/'; + } + // Otherwise, if path_buffer is not a single-dot path segment, then: + else if (!unicode::is_single_dot_path_segment(path_buffer)) { + // If url’s scheme is "file", url’s path is empty, and path_buffer is a + // Windows drive letter, then replace the second code point in + // path_buffer with U+003A (:). + if (get_scheme_type() == ada::scheme::type::FILE && path.empty() && + checkers::is_windows_drive_letter(path_buffer)) { + path += '/'; + path += path_buffer[0]; + path += ':'; + path_buffer.remove_prefix(2); + path.append(path_buffer); + } else { + // Append path_buffer to url’s path. + path += '/'; + path.append(path_buffer); + } + } + if (location == std::string_view::npos) { + return true; + } + } while (true); + } + } + + bool url::parse_opaque_host(std::string_view input) { + ada_log("parse_opaque_host ", input, "[", input.size(), " bytes]"); + if (std::any_of(input.begin(), input.end(), ada::unicode::is_forbidden_host_code_point)) { + return is_valid = false; + } + + // Return the result of running UTF-8 percent-encode on input using the C0 control percent-encode set. + host = ada::unicode::percent_encode(input, ada::character_sets::C0_CONTROL_PERCENT_ENCODE); + return true; + } + + bool url::parse_ipv4(std::string_view input) { + ada_log("parse_ipv4 ", input, "[", input.size(), " bytes]"); + if(input.back()=='.') { + input.remove_suffix(1); + } + size_t digit_count{0}; + uint64_t ipv4{0}; + // we could unroll for better performance? + for(;(digit_count < 4) && !(input.empty()); digit_count++) { + uint32_t result{}; // If any number exceeds 32 bits, we have an error. 
+ bool is_hex = checkers::has_hex_prefix(input); + if(is_hex && ((input.length() == 2)|| ((input.length() > 2) && (input[2]=='.')))) { + // special case + result = 0; + input.remove_prefix(2); + } else { + std::from_chars_result r; + if(is_hex) { + r = std::from_chars(input.data() + 2, input.data() + input.size(), result, 16); + } else if ((input.length() >= 2) && input[0] == '0' && checkers::is_digit(input[1])) { + r = std::from_chars(input.data() + 1, input.data() + input.size(), result, 8); + } else { + r = std::from_chars(input.data(), input.data() + input.size(), result, 10); + } + if (r.ec != std::errc()) { return is_valid = false; } + input.remove_prefix(r.ptr-input.data()); + } + if(input.empty()) { + // We have the last value. + // At this stage, ipv4 contains digit_count*8 bits. + // So we have 32-digit_count*8 bits left. + if(result > (uint64_t(1)<<(32-digit_count*8))) { return is_valid = false; } + ipv4 <<=(32-digit_count*8); + ipv4 |= result; + goto final; + } else { + // There is more, so that the value must no be larger than 255 + // and we must have a '.'. + if ((result>255) || (input[0]!='.')) { return is_valid = false; } + ipv4 <<=8; + ipv4 |= result; + input.remove_prefix(1); // remove '.' + } + } + if((digit_count != 4) || (!input.empty())) {return is_valid = false; } + final: + // We could also check result.ptr to see where the parsing ended. + host = ada::serializers::ipv4(ipv4); + return true; + } + + bool url::parse_ipv6(std::string_view input) { + ada_log("parse_ipv6 ", input, "[", input.size(), " bytes]"); + + if(input.empty()) { return is_valid = false; } + // Let address be a new IPv6 address whose IPv6 pieces are all 0. + std::array address{}; + + // Let pieceIndex be 0. + int piece_index = 0; + + // Let compress be null. + std::optional compress{}; + + // Let pointer be a pointer for input. 
+ std::string_view::iterator pointer = input.begin(); + + // If c is U+003A (:), then: + if (input[0] == ':') { + // If remaining does not start with U+003A (:), validation error, return failure. + if(input.size() == 1 || input[1] != ':') { + ada_log("parse_ipv6 starts with : but the rest does not start with :"); + return is_valid = false; + } + + // Increase pointer by 2. + pointer += 2; + + // Increase pieceIndex by 1 and then set compress to pieceIndex. + compress = ++piece_index; + } + + // While c is not the EOF code point: + while (pointer != input.end()) { + // If pieceIndex is 8, validation error, return failure. + if (piece_index == 8) { + ada_log("parse_ipv6 piece_index == 8"); + return is_valid = false; + } + + // If c is U+003A (:), then: + if (*pointer == ':') { + // If compress is non-null, validation error, return failure. + if (compress.has_value()) { + ada_log("parse_ipv6 compress is non-null"); + return is_valid = false; + } + + // Increase pointer and pieceIndex by 1, set compress to pieceIndex, and then continue. + pointer++; + compress = ++piece_index; + continue; + } + + // Let value and length be 0. + uint16_t value = 0, length = 0; + + // While length is less than 4 and c is an ASCII hex digit, + // set value to value × 0x10 + c interpreted as hexadecimal number, and increase pointer and length by 1. + while (length < 4 && pointer != input.end() && unicode::is_ascii_hex_digit(*pointer)) { + // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int + value = uint16_t(value * 0x10 + unicode::convert_hex_to_binary(*pointer)); + pointer++; + length++; + } + + // If c is U+002E (.), then: + if (pointer != input.end() && *pointer == '.') { + // If length is 0, validation error, return failure. + if (length == 0) { + ada_log("parse_ipv6 length is 0"); + return is_valid = false; + } + + // Decrease pointer by length. + pointer -= length; + + // If pieceIndex is greater than 6, validation error, return failure. 
+ if (piece_index > 6) { + ada_log("parse_ipv6 piece_index > 6"); + return is_valid = false; + } + + // Let numbersSeen be 0. + int numbers_seen = 0; + + // While c is not the EOF code point: + while (pointer != input.end()) { + // Let ipv4Piece be null. + std::optional ipv4_piece{}; + + // If numbersSeen is greater than 0, then: + if (numbers_seen > 0) { + // If c is a U+002E (.) and numbersSeen is less than 4, then increase pointer by 1. + if (*pointer == '.' && numbers_seen < 4) { + pointer++; + } + // Otherwise, validation error, return failure. + else { + ada_log("parse_ipv6 Otherwise, validation error, return failure"); + return is_valid = false; + } + } + + // If c is not an ASCII digit, validation error, return failure. + if (pointer == input.end() || !checkers::is_digit(*pointer)) { + ada_log("parse_ipv6 If c is not an ASCII digit, validation error, return failure"); + return is_valid = false; + } + + // While c is an ASCII digit: + while (pointer != input.end() && checkers::is_digit(*pointer)) { + // Let number be c interpreted as decimal number. + int number = *pointer - '0'; + + // If ipv4Piece is null, then set ipv4Piece to number. + if (!ipv4_piece.has_value()) { + ipv4_piece = number; + } + // Otherwise, if ipv4Piece is 0, validation error, return failure. + else if (ipv4_piece == 0) { + ada_log("parse_ipv6 if ipv4Piece is 0, validation error"); + return is_valid = false; + } + // Otherwise, set ipv4Piece to ipv4Piece × 10 + number. + else { + ipv4_piece = *ipv4_piece * 10 + number; + } + + // If ipv4Piece is greater than 255, validation error, return failure. + if (ipv4_piece > 255) { + ada_log("parse_ipv6 ipv4_piece > 255"); + return is_valid = false; + } + + // Increase pointer by 1. + pointer++; + } + + // Set address[pieceIndex] to address[pieceIndex] × 0x100 + ipv4Piece. 
+ // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int + address[piece_index] = uint16_t(address[piece_index] * 0x100 + *ipv4_piece); + + // Increase numbersSeen by 1. + numbers_seen++; + + // If numbersSeen is 2 or 4, then increase pieceIndex by 1. + if (numbers_seen == 2 || numbers_seen == 4) { + piece_index++; + } + } + + // If numbersSeen is not 4, validation error, return failure. + if (numbers_seen != 4) { + return is_valid = false; + } + + // Break. + break; + } + // Otherwise, if c is U+003A (:): + else if ((pointer != input.end()) && (*pointer == ':')) { + // Increase pointer by 1. + pointer++; + + // If c is the EOF code point, validation error, return failure. + if (pointer == input.end()) { + ada_log("parse_ipv6 If c is the EOF code point, validation error, return failure"); + return is_valid = false; + } + } + // Otherwise, if c is not the EOF code point, validation error, return failure. + else if (pointer != input.end()) { + ada_log("parse_ipv6 Otherwise, if c is not the EOF code point, validation error, return failure"); + return is_valid = false; + } + + // Set address[pieceIndex] to value. + address[piece_index] = value; + + // Increase pieceIndex by 1. + piece_index++; + } + + // If compress is non-null, then: + if (compress.has_value()) { + // Let swaps be pieceIndex − compress. + int swaps = piece_index - *compress; + + // Set pieceIndex to 7. + piece_index = 7; + + // While pieceIndex is not 0 and swaps is greater than 0, + // swap address[pieceIndex] with address[compress + swaps − 1], and then decrease both pieceIndex and swaps by 1. + while (piece_index != 0 && swaps > 0) { + std::swap(address[piece_index], address[*compress + swaps - 1]); + piece_index--; + swaps--; + } + } + // Otherwise, if compress is null and pieceIndex is not 8, validation error, return failure. 
+ else if (piece_index != 8) { + ada_log("parse_ipv6 if compress is null and pieceIndex is not 8, validation error, return failure"); + return is_valid = false; + } + host = ada::serializers::ipv6(address); + ada_log("parse_ipv6 ", *host); + return true; + } + + ada_really_inline bool url::parse_host(std::string_view input) { + ada_log("parse_host ", input, "[", input.size(), " bytes]"); + if(input.empty()) { return is_valid = false; } // technically unnecessary. + // If input starts with U+005B ([), then: + if (input[0] == '[') { + // If input does not end with U+005D (]), validation error, return failure. + if (input.back() != ']') { + return is_valid = false; + } + ada_log("parse_host ipv6"); + + // Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed. + input.remove_prefix(1); + input.remove_suffix(1); + return parse_ipv6(input); + } + + // If isNotSpecial is true, then return the result of opaque-host parsing input. + if (!is_special()) { + return parse_opaque_host(input); + } + // Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input. + // Let asciiDomain be the result of running domain to ASCII with domain and false. + // The most common case is an ASCII input, in which case we do not need to call the expensive 'to_ascii' + // if a few conditions are met: no '%' and no 'xn-' subsequence. + std::string buffer; + uint8_t is_forbidden{0}; + + buffer.reserve(input.size()); + std::transform(input.begin(), input.end(), std::back_inserter(buffer), [&is_forbidden](char c) -> char { + is_forbidden |= ada::unicode::is_forbidden_domain_code_point(c); + return (uint8_t((c|0x20) - 0x61) <= 25 ? 
(c|0x20) : c);} + ); + if (is_forbidden == 0 && buffer.find("xn-") == std::string_view::npos) { + // fast path + host = std::move(buffer); + if (checkers::is_ipv4(host.value())) { + ada_log("parse_host fast path ipv4"); + return parse_ipv4(host.value()); + } + ada_log("parse_host fast path ", *host); + return true; + } + ada_log("parse_host calling to_ascii"); + is_valid = ada::unicode::to_ascii(host, input, false, input.find('%')); + if (!is_valid) { + ada_log("parse_host to_ascii returns false"); + return is_valid = false; + } + + // If asciiDomain ends in a number, then return the result of IPv4 parsing asciiDomain. + if(checkers::is_ipv4(host.value())) { + ada_log("parse_host got ipv4", *host); + return parse_ipv4(host.value()); + } + + return true; + } + + template + ada_really_inline bool url::parse_scheme(const std::string_view input) { + auto parsed_type = ada::scheme::get_scheme_type(input); + bool is_input_special = (parsed_type != ada::scheme::NOT_SPECIAL); + /** + * In the common case, we will immediately recognize a special scheme (e.g., http, https), + * in which case, we can go really fast. + **/ + if(is_input_special) { // fast path!!! + if (has_state_override) { + // If url’s scheme is not a special scheme and buffer is a special scheme, then return. + if (is_special() != is_input_special) { + return true; + } + + // If url includes credentials or has a non-null port, and buffer is "file", then return. + if ((includes_credentials() || port.has_value()) && parsed_type == ada::scheme::type::FILE) { + return true; + } + + // If url’s scheme is "file" and its host is an empty host, then return. + // An empty host is the empty string. + if (get_scheme_type() == ada::scheme::type::FILE && host.has_value() && host.value().empty()) { + return true; + } + } + + type = parsed_type; + + if (has_state_override) { + // This is uncommon. 
+ uint16_t urls_scheme_port = get_special_port(); + + if (urls_scheme_port) { + // If url’s port is url’s scheme’s default port, then set url’s port to null. + if (port.has_value() && *port == urls_scheme_port) { + port = std::nullopt; + } + } + } + } else { // slow path + std::string _buffer; + // Optimization opportunity: Most of the time scheme's are all lowercase. + // If that's the case, there's no need to copy. + std::transform(input.begin(), input.end(), std::back_inserter(_buffer), + [](char c) -> char { return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c);}); + + if (has_state_override) { + // If url’s scheme is a special scheme and buffer is not a special scheme, then return. + // If url’s scheme is not a special scheme and buffer is a special scheme, then return. + if (is_special() != ada::scheme::is_special(_buffer)) { + return true; + } + + // If url includes credentials or has a non-null port, and buffer is "file", then return. + if ((includes_credentials() || port.has_value()) && _buffer == "file") { + return true; + } + + // If url’s scheme is "file" and its host is an empty host, then return. + // An empty host is the empty string. + if (get_scheme_type() == ada::scheme::type::FILE && host.has_value() && host.value().empty()) { + return true; + } + } + + set_scheme(std::move(_buffer)); + + if (has_state_override) { + // This is uncommon. + uint16_t urls_scheme_port = get_special_port(); + + if (urls_scheme_port) { + // If url’s port is url’s scheme’s default port, then set url’s port to null. 
+ if (port.has_value() && *port == urls_scheme_port) { + port = std::nullopt; + } + } + } + } + + return true; + } + + std::string url::to_string() const { + if (!is_valid) { + return "null"; + } + std::string answer; + auto back = std::back_insert_iterator(answer); + answer.append("{\n"); + answer.append("\t\"scheme\":\""); + helpers::encode_json(get_scheme(), back); + answer.append("\",\n"); + if(includes_credentials()) { + answer.append("\t\"username\":\""); + helpers::encode_json(username, back); + answer.append("\",\n"); + answer.append("\t\"password\":\""); + helpers::encode_json(password, back); + answer.append("\",\n"); + } + if(host.has_value()) { + answer.append("\t\"host\":\""); + helpers::encode_json(host.value(), back); + answer.append("\",\n"); + } + if(port.has_value()) { + answer.append("\t\"port\":\""); + answer.append(std::to_string(port.value())); + answer.append("\",\n"); + } + answer.append("\t\"path\":\""); + helpers::encode_json(path, back); + answer.append("\",\n"); + answer.append("\t\"opaque path\":"); + answer.append((has_opaque_path ? 
"true" : "false")); + if(query.has_value()) { + answer.append(",\n"); + answer.append("\t\"query\":\""); + helpers::encode_json(query.value(), back); + answer.append("\""); + } + if(fragment.has_value()) { + answer.append(",\n"); + answer.append("\t\"fragment\":\""); + helpers::encode_json(fragment.value(), back); + answer.append("\""); + } + answer.append("\n}"); + return answer; + } + + [[nodiscard]] bool url::has_valid_domain() const noexcept { + if(!host.has_value()) { return false; } + return checkers::verify_dns_length(host.value()); + } +} // namespace ada +/* end file src/url.cpp */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=url-getters.cpp +/* begin file src/url-getters.cpp */ +/** + * @file url-getters.cpp + * Includes all the getters of `ada::url` + */ + +#include +#include + +namespace ada { + + [[nodiscard]] std::string url::get_href() const noexcept { + std::string output = get_protocol(); + size_t url_delimiter_count = std::count(path.begin(), path.end(), '/'); + + if (host.has_value()) { + output += "//"; + if (includes_credentials()) { + output += get_username(); + if (!get_password().empty()) { + output += ":" + get_password(); + } + output += "@"; + } + + output += get_host(); + } else if (!has_opaque_path && url_delimiter_count > 1 && path.length() >= 2 && path[0] == '/' && path[1] == '/') { + // If url’s host is null, url does not have an opaque path, url’s path’s size is greater than 1, + // and url’s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output. + output += "/."; + } + + output += get_pathname() + // If query is non-null, then set this’s query object’s list to the result of parsing query. + + (query.has_value() ? "?" + query.value() : "") + // If url’s fragment is non-null, then append U+0023 (#), followed by url’s fragment, to output. + + (fragment.has_value() ? 
"#" + fragment.value() : ""); + return output; + } + + [[nodiscard]] std::string url::get_origin() const noexcept { + if (is_special()) { + return get_protocol() + "//" + get_host(); + } + + if (get_scheme() == "blob") { + if (path.length() > 0) { + url result = ada::parser::parse_url(get_pathname()); + if (result.is_valid) { + if (result.is_special()) { + return result.get_protocol() + "//" + result.get_host(); + } + } + } + } + + // Return a new opaque origin. + return "null"; + } + + [[nodiscard]] std::string url::get_protocol() const noexcept { + return std::string(get_scheme()) + ":"; + } + + [[nodiscard]] std::string url::get_host() const noexcept { + // If url’s host is null, then return the empty string. + // If url’s port is null, return url’s host, serialized. + // Return url’s host, serialized, followed by U+003A (:) and url’s port, serialized. + if (!host.has_value()) { return ""; } + return host.value() + (port.has_value() ? ":" + get_port() : ""); + } + + [[nodiscard]] std::string url::get_hostname() const noexcept { + return host.value_or(""); + } + + [[nodiscard]] std::string url::get_pathname() const noexcept { + return path; + } + + [[nodiscard]] std::string url::get_search() const noexcept { + // If this’s URL’s query is either null or the empty string, then return the empty string. + // Return U+003F (?), followed by this’s URL’s query. + return (!query.has_value() || (query.value().empty())) ? "" : "?" + query.value(); + } + + [[nodiscard]] std::string url::get_username() const noexcept { + return username; + } + + [[nodiscard]] std::string url::get_password() const noexcept { + return password; + } + + [[nodiscard]] std::string url::get_port() const noexcept { + return port.has_value() ? std::to_string(port.value()) : ""; + } + + [[nodiscard]] std::string url::get_hash() const noexcept { + // If this’s URL’s fragment is either null or the empty string, then return the empty string. + // Return U+0023 (#), followed by this’s URL’s fragment. 
+ return (!fragment.has_value() || (fragment.value().empty())) ? "" : "#" + fragment.value(); + } + +} // namespace ada +/* end file src/url-getters.cpp */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=url-setters.cpp +/* begin file src/url-setters.cpp */ +/** + * @file url-setters.cpp + * Includes all the setters of `ada::url` + */ + +#include +#include + +namespace ada { + + bool url::set_username(const std::string_view input) { + if (cannot_have_credentials_or_port()) { return false; } + username = ada::unicode::percent_encode(input, character_sets::USERINFO_PERCENT_ENCODE); + return true; + } + + bool url::set_password(const std::string_view input) { + if (cannot_have_credentials_or_port()) { return false; } + password = ada::unicode::percent_encode(input, character_sets::USERINFO_PERCENT_ENCODE); + return true; + } + + bool url::set_port(const std::string_view input) { + if (cannot_have_credentials_or_port()) { return false; } + std::string trimmed(input); + helpers::remove_ascii_tab_or_newline(trimmed); + if (trimmed.empty()) { port = std::nullopt; return true; } + // Input should not start with control characters. + if (ada::unicode::is_c0_control_or_space(trimmed.front())) { return false; } + return parse_port(trimmed); + } + + void url::set_hash(const std::string_view input) { + if (input.empty()) { + fragment = std::nullopt; + // TODO: Potentially strip trailing spaces from an opaque path with this. + return; + } + + std::string new_value; + new_value = input[0] == '#' ? input.substr(1) : input; + helpers::remove_ascii_tab_or_newline(new_value); + fragment = unicode::percent_encode(new_value, ada::character_sets::FRAGMENT_PERCENT_ENCODE); + return; + } + + void url::set_search(const std::string_view input) { + if (input.empty()) { + query = std::nullopt; + // Empty this’s query object’s list. + // @todo Implement this if/when we have URLSearchParams. + // Potentially strip trailing spaces from an opaque path with this. 
+ return; + } + + std::string new_value; + new_value = input[0] == '?' ? input.substr(1) : input; + helpers::remove_ascii_tab_or_newline(new_value); + + auto query_percent_encode_set = is_special() ? + ada::character_sets::SPECIAL_QUERY_PERCENT_ENCODE : + ada::character_sets::QUERY_PERCENT_ENCODE; + + query = ada::unicode::percent_encode(std::string_view(new_value), query_percent_encode_set); + + // Set this’s query object’s list to the result of parsing input. + // @todo Implement this if/when we have URLSearchParams. + return ; + } + + bool url::set_pathname(const std::string_view input) { + if (has_opaque_path) { return false; } + path = ""; + return parse_path(input); + } + + bool url::set_host(const std::string_view input) { + if (has_opaque_path) { return false; } + + std::optional previous_host = host; + std::optional previous_port = port; + + std::string_view::iterator _host_end = std::find(input.begin(), input.end(), '#'); + std::string _host(input.data(), std::distance(input.begin(), _host_end)); + helpers::remove_ascii_tab_or_newline(_host); + std::string_view new_host(_host); + + // If url's scheme is "file", then set state to file host state, instead of host state. + if (get_scheme_type() != ada::scheme::type::FILE) { + std::string_view host_view(_host.data(), _host.length()); + bool inside_brackets{false}; + size_t location = helpers::get_host_delimiter_location(*this, host_view, inside_brackets); + std::string_view::iterator pointer = (location != std::string_view::npos) ? new_host.begin() + location : new_host.end(); + + // Otherwise, if c is U+003A (:) and insideBrackets is false, then: + // Note: we cannot access *pointer safely if (pointer == pointer_end). + if ((pointer != new_host.end()) && (*pointer == ':') && !inside_brackets) { + // TODO: The next 2 lines is the only difference between set_host and set_hostname. Let's simplify it. 
+ std::string_view buffer(&*(pointer + 1)); + if (!buffer.empty()) { set_port(buffer); } + } + // If url is special and host_view is the empty string, validation error, return failure. + // Otherwise, if state override is given, host_view is the empty string, + // and either url includes credentials or url’s port is non-null, return. + else if (host_view.empty() && (is_special() || includes_credentials() || port.has_value())) { + return false; + } + + // Let host be the result of host parsing host_view with url is not special. + if (host_view.empty()) { + host = ""; + return true; + } + + bool succeeded = parse_host(host_view); + if (!succeeded) { + host = previous_host; + port = previous_port; + } + return succeeded; + } + + size_t location = new_host.find_first_of("/\\?"); + if (location != std::string_view::npos) { new_host.remove_suffix(new_host.length() - location); } + + if (new_host.empty()) { + // Set url’s host to the empty string. + host = ""; + } + else { + // Let host be the result of host parsing buffer with url is not special. + if (!parse_host(new_host)) { + host = previous_host; + port = previous_port; + return false; + } + + // If host is "localhost", then set host to the empty string. + if (host.has_value() && host.value() == "localhost") { + host = ""; + } + } + return true; + } + + bool url::set_hostname(const std::string_view input) { + if (has_opaque_path) { return false; } + + std::optional previous_host = host; + + std::string_view::iterator input_pointer_end = std::find(input.begin(), input.end(), '#'); + std::string _host(input.data(), std::distance(input.begin(), input_pointer_end)); + helpers::remove_ascii_tab_or_newline(_host); + std::string_view new_host(_host); + + // If url's scheme is "file", then set state to file host state, instead of host state. 
+ if (get_scheme_type() != ada::scheme::type::FILE) { + std::string_view host_view(_host.data(), _host.length()); + bool inside_brackets{false}; + size_t location = helpers::get_host_delimiter_location(*this, host_view, inside_brackets); + std::string_view::iterator pointer = (location != std::string_view::npos) ? new_host.begin() + location : new_host.end(); + + // Otherwise, if c is U+003A (:) and insideBrackets is false, then: + // Note: we cannot access *pointer safely if (pointer == pointer_end). + if ((pointer != new_host.end()) && (*pointer == ':') && !inside_brackets) { + // If buffer is the empty string, validation error, return failure. + return false; + } + // If url is special and host_view is the empty string, validation error, return failure. + else if (host_view.empty() && is_special()) { + return false; + } + // Otherwise, if state override is given, host_view is the empty string, + // and either url includes credentials or url’s port is non-null, return. + else if (host_view.empty() && (includes_credentials() || port.has_value())) { + return true; + } + + // Let host be the result of host parsing host_view with url is not special. + if (host_view.empty()) { + host = ""; + return true; + } + + bool succeeded = parse_host(host_view); + if (!succeeded) { host = previous_host; } + return succeeded; + } + + size_t location = new_host.find_first_of("/\\?"); + if (location != std::string_view::npos) { new_host.remove_suffix(new_host.length() - location); } + + if (new_host.empty()) { + // Set url’s host to the empty string. + host = ""; + } + else { + // Let host be the result of host parsing buffer with url is not special. + if (!parse_host(new_host)) { + host = previous_host; + return false; + } + + // If host is "localhost", then set host to the empty string. 
+      if (host.has_value() && host.value() == "localhost") {
+        host = "";
+      }
+    }
+    return true;
+  }
+
+  // Sets the scheme from `input`.
+  // Returns true when `input` is empty after helpers::remove_ascii_tab_or_newline()
+  // (nothing to do) or when parse_scheme() accepts the candidate; false otherwise.
+  bool url::set_protocol(const std::string_view input) {
+    std::string view(input);
+    helpers::remove_ascii_tab_or_newline(view);
+    if (view.empty()) { return true; }
+
+    // Schemes should start with alpha values.
+    if (!checkers::is_alpha(view[0])) { return false; }
+
+    // Appending ':' guarantees the scan below stops on a terminator even when
+    // the caller passed the scheme without one.
+    view.append(":");
+
+    std::string::iterator pointer = std::find_if_not(view.begin(), view.end(), unicode::is_alnum_plus);
+
+    // Only the characters before the first ':' are handed to parse_scheme();
+    // any other stopping character makes the input invalid.
+    if (pointer != view.end() && *pointer == ':') {
+      return parse_scheme(std::string_view(view.data(), pointer - view.begin()));
+    }
+    return false;
+  }
+
+  // Replaces this URL with the result of parsing `input`.
+  // Returns false, leaving this URL untouched, when `input` does not parse.
+  bool url::set_href(const std::string_view input) {
+    ada::url out = ada::parse(input);
+    if (!out.is_valid) { return false; }
+
+    // Copy the parsed components directly instead of replaying them through the
+    // individual setters: their return values were ignored, and a setter can
+    // refuse the update (e.g. set_hostname() bails out when has_opaque_path is
+    // set), which left a mix of the old and the new URL while reporting success.
+    copy_scheme(out);
+    username = out.username;
+    password = out.password;
+    host = out.host;
+    port = out.port;
+    path = out.path;
+    has_opaque_path = out.has_opaque_path;
+    query = out.query;
+    fragment = out.fragment;
+    return true;
+  }
+
+} // namespace ada
+/* end file src/url-setters.cpp */
+// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/src, filename=parser.cpp
+/* begin file src/parser.cpp */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace ada::parser {
+
+  url parse_url(std::string_view user_input,
+                const ada::url* base_url,
+                ada::encoding_type encoding) {
+    ada_log("ada::parser::parse_url('", user_input,
+     "' [", user_input.size()," bytes],", (base_url != nullptr ? base_url->to_string() : "null"),
+     ",", ada::to_string(encoding), ")");
+
+    ada::state state = ada::state::SCHEME_START;
+    ada::url url = ada::url();
+
+    // If we are provided with an invalid base, or the optional_url was invalid,
+    // we must return.
+ if(base_url != nullptr) { url.is_valid &= base_url->is_valid; } + if(!url.is_valid) { return url; } + + std::string tmp_buffer; + std::string_view internal_input; + if(unicode::has_tabs_or_newline(user_input)) { + tmp_buffer = user_input; + // Optimization opportunity: Instead of copying and then pruning, we could just directly + // build the string from user_input. + helpers::remove_ascii_tab_or_newline(tmp_buffer); + internal_input = tmp_buffer; + } else { + internal_input = user_input; + } + + // Leading and trailing control characters are uncommon and easy to deal with (no performance concern). + std::string_view url_data = internal_input; + helpers::trim_c0_whitespace(url_data); + + // Optimization opportunity. Most websites do not have fragment. + std::optional fragment = helpers::prune_fragment(url_data); + if(fragment.has_value()) { + url.fragment = unicode::percent_encode(*fragment, + ada::character_sets::FRAGMENT_PERCENT_ENCODE); + } + + // Here url_data no longer has its fragment. + // We are going to access the data from url_data (it is immutable). + // At any given time, we are pointing at byte 'input_position' in url_data. + // The input_position variable should range from 0 to input_size. + // It is illegal to access url_data at input_size. + size_t input_position = 0; + const size_t input_size = url_data.size(); + // Keep running the following state machine by switching on state. + // If after a run pointer points to the EOF code point, go to the next step. + // Otherwise, increase pointer by 1 and continue with the state machine. + for (; input_position <= input_size; input_position++) { + switch (state) { + case ada::state::SCHEME_START: { + ada_log("SCHEME_START ", helpers::substring(url_data, input_position)); + // If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state. 
+ if ((input_position != input_size) && checkers::is_alpha(url_data[input_position])) { + state = ada::state::SCHEME; + goto goto_scheme; + } + // Otherwise, if state override is not given, set state to no scheme state and decrease pointer by 1. + else { + state = ada::state::NO_SCHEME; + goto goto_no_scheme;; + } + break; + } + case ada::state::SCHEME: { + goto_scheme: + ada_log("SCHEME ", helpers::substring(url_data, input_position)); + // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), append c, lowercased, to buffer. + while((input_position != input_size) && (ada::unicode::is_alnum_plus(url_data[input_position]))) { + input_position++; + } + // Otherwise, if c is U+003A (:), then: + if ((input_position != input_size) && (url_data[input_position] == ':')) { + ada_log("SCHEME the scheme should be ", url_data.substr(0,input_position)); + if(!url.parse_scheme(url_data.substr(0,input_position))) { return url; } + ada_log("SCHEME the scheme is ", url.get_scheme()); + // If url’s scheme is "file", then: + if (url.get_scheme_type() == ada::scheme::type::FILE) { + // Set state to file state. + state = ada::state::FILE; + } + // Otherwise, if url is special, base is non-null, and base’s scheme is url’s scheme: + // Note: Doing base_url->scheme is unsafe if base_url != nullptr is false. + else if (url.is_special() && base_url != nullptr && base_url->get_scheme_type() == url.get_scheme_type()) { + // Set state to special relative or authority state. + state = ada::state::SPECIAL_RELATIVE_OR_AUTHORITY; + } + // Otherwise, if url is special, set state to special authority slashes state. + else if (url.is_special()) { + state = ada::state::SPECIAL_AUTHORITY_SLASHES; + } + // Otherwise, if remaining starts with an U+002F (/), set state to path or authority state + // and increase pointer by 1. 
+ else if (input_position + 1 < input_size && url_data[input_position + 1] == '/') { + state = ada::state::PATH_OR_AUTHORITY; + input_position++; + } + // Otherwise, set url’s path to the empty string and set state to opaque path state. + else { + state = ada::state::OPAQUE_PATH; + } + } + // Otherwise, if state override is not given, set buffer to the empty string, state to no scheme state, + // and start over (from the first code point in input). + else { + state = ada::state::NO_SCHEME; + input_position = 0; + goto goto_no_scheme; + } + break; + } + case ada::state::NO_SCHEME: { + goto_no_scheme: + ada_log("NO_SCHEME ", helpers::substring(url_data, input_position)); + // If base is null, or base has an opaque path and c is not U+0023 (#), validation error, return failure. + if (base_url == nullptr || (base_url->has_opaque_path && (input_position != input_size))) { + ada_log("NO_SCHEME validation error"); + url.is_valid = false; + return url; + } + // Otherwise, if base has an opaque path and c is U+0023 (#), + // set url’s scheme to base’s scheme, url’s path to base’s path, url’s query to base’s query, + // url’s fragment to the empty string, and set state to fragment state. + else if (base_url->has_opaque_path && url.fragment.has_value() && input_position == input_size) { + ada_log("NO_SCHEME opaque base with fragment"); + url.copy_scheme(*base_url); + url.path = base_url->path; + url.has_opaque_path = base_url->has_opaque_path; + url.query = base_url->query; + return url; + } + // Otherwise, if base’s scheme is not "file", set state to relative state and decrease pointer by 1. + else if (base_url->get_scheme_type() != ada::scheme::type::FILE) { + ada_log("NO_SCHEME non-file relative path"); + state = ada::state::RELATIVE_SCHEME; + goto goto_relative_scheme; + } + // Otherwise, set state to file state and decrease pointer by 1. 
+ else { + ada_log("NO_SCHEME file base type"); + state = ada::state::FILE; + goto goto_file; + } + break; + } + case ada::state::AUTHORITY: { + goto_authority: + ada_log("AUTHORITY ", helpers::substring(url_data, input_position)); + // most URLs have no @. Having no @ tells us that we don't have to worry about AUTHORITY. Of course, + // we could have @ and still not have to worry about AUTHORITY. + // TODO: Instead of just collecting a bool, collect the location of the '@' and do something useful with it. + // TODO: We could do various processing early on, using a single pass over the string to collect + // information about it, e.g., telling us whether there is a @ and if so, where (or how many). + const bool contains_ampersand = (url_data.find('@', input_position) != std::string_view::npos); + + if(!contains_ampersand) { + state = ada::state::HOST; + goto goto_host; + } + bool at_sign_seen{false}; + bool password_token_seen{false}; + do { + std::string_view view = helpers::substring(url_data, input_position); + size_t location = url.is_special() ? view.find_first_of("@/?\\") : view.find_first_of("@/?"); + std::string_view authority_view(view.data(), (location != std::string_view::npos) ? location : view.size()); + input_position = (location == std::string_view::npos) ? input_size : input_position + location; + // If c is U+0040 (@), then: + if ((input_position != input_size) && (url_data[input_position] == '@')) { + // If atSignSeen is true, then prepend "%40" to buffer. 
+ if (at_sign_seen) { + if (password_token_seen) { + url.password += "%40"; + } else { + url.username += "%40"; + } + } + + at_sign_seen = true; + + if (!password_token_seen) { + size_t password_token_location = authority_view.find(':'); + password_token_seen = password_token_location != std::string_view::npos; + + if (!password_token_seen) { + url.username += unicode::percent_encode(authority_view, character_sets::USERINFO_PERCENT_ENCODE); + } else { + url.username += unicode::percent_encode(authority_view.substr(0,password_token_location), character_sets::USERINFO_PERCENT_ENCODE); + url.password += unicode::percent_encode(authority_view.substr(password_token_location+1), character_sets::USERINFO_PERCENT_ENCODE); + } + } + else { + url.password += unicode::percent_encode(authority_view, character_sets::USERINFO_PERCENT_ENCODE); + } + } + // Otherwise, if one of the following is true: + // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#) + // - url is special and c is U+005C (\) + else if (input_position == input_size || url_data[input_position] == '/' || url_data[input_position] == '?' || (url.is_special() && url_data[input_position] == '\\')) { + // If atSignSeen is true and authority_view is the empty string, validation error, return failure. + if (at_sign_seen && authority_view.empty()) { + url.is_valid = false; + return url; + } + // Decrease pointer by the number of code points in buffer plus one, + // set buffer to the empty string, and set state to host state. + input_position -= authority_view.length() + 1; + state = ada::state::HOST; + break; + } + if(input_position == input_size) { break; } + input_position++; + } while(true); + break; + } + case ada::state::SPECIAL_RELATIVE_OR_AUTHORITY: { + ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ", helpers::substring(url_data, input_position)); + // If c is U+002F (/) and remaining starts with U+002F (/), + // then set state to special authority ignore slashes state and increase pointer by 1. 
+ std::string_view view = helpers::substring(url_data, input_position); + if (ada::checkers::begins_with(view, "//")) { + state = ada::state::SPECIAL_AUTHORITY_IGNORE_SLASHES; + input_position++; + } + // Otherwise, validation error, set state to relative state and decrease pointer by 1. + else { + state = ada::state::RELATIVE_SCHEME; + goto goto_relative_scheme; + } + + break; + } + case ada::state::PATH_OR_AUTHORITY: { + ada_log("PATH_OR_AUTHORITY ", helpers::substring(url_data, input_position)); + // If c is U+002F (/), then set state to authority state. + if ((input_position != input_size) && (url_data[input_position] == '/')) { + state = ada::state::AUTHORITY; + } + // Otherwise, set state to path state, and decrease pointer by 1. + else { + state = ada::state::PATH; + goto goto_path; + } + + break; + } + case ada::state::RELATIVE_SCHEME: { + goto_relative_scheme: + ada_log("RELATIVE_SCHEME ", helpers::substring(url_data, input_position)); + // Set url’s scheme to base’s scheme. + url.copy_scheme(*base_url); + // If c is U+002F (/), then set state to relative slash state. + if ((input_position != input_size) && (url_data[input_position] == '/')) { + ada_log("RELATIVE_SCHEME if c is U+002F (/), then set state to relative slash state"); + state = ada::state::RELATIVE_SLASH; + } + // Otherwise, if url is special and c is U+005C (\), validation error, set state to relative slash state. + else if (url.is_special() && (input_position != input_size) && (url_data[input_position] == '\\')) { + ada_log("RELATIVE_SCHEME if url is special and c is U+005C, validation error, set state to relative slash state"); + state = ada::state::RELATIVE_SLASH; + } + // Otherwise: + else { + ada_log("RELATIVE_SCHEME otherwise"); + // Set url’s username to base’s username, url’s password to base’s password, url’s host to base’s host, + // url’s port to base’s port, url’s path to a clone of base’s path, and url’s query to base’s query. 
+ url.username = base_url->username; + url.password = base_url->password; + url.host = base_url->host; + url.port = base_url->port; + url.path = base_url->path; + url.has_opaque_path = base_url->has_opaque_path; + url.query = base_url->query; + + // If c is U+003F (?), then set url’s query to the empty string, and state to query state. + if ((input_position != input_size) && (url_data[input_position] == '?')) { + url.query = ""; + state = ada::state::QUERY; + } + // Otherwise, if c is not the EOF code point: + else if (input_position != input_size) { + // Set url’s query to null. + url.query = std::nullopt; + + // Shorten url’s path. + helpers::shorten_path(url); + + // Set state to path state and decrease pointer by 1. + state = ada::state::PATH; + goto goto_path; + } + } + break; + } + case ada::state::RELATIVE_SLASH: { + ada_log("RELATIVE_SLASH ", helpers::substring(url_data, input_position)); + // If url is special and c is U+002F (/) or U+005C (\), then: + if (url.is_special() && (input_position != input_size) && (url_data[input_position] == '/' || url_data[input_position] =='\\')) { + // Set state to special authority ignore slashes state. + state = ada::state::SPECIAL_AUTHORITY_IGNORE_SLASHES; + } + // Otherwise, if c is U+002F (/), then set state to authority state. + else if ((input_position != input_size) && (url_data[input_position] == '/')) { + state = ada::state::AUTHORITY; + } + // Otherwise, set + // - url’s username to base’s username, + // - url’s password to base’s password, + // - url’s host to base’s host, + // - url’s port to base’s port, + // - state to path state, and then, decrease pointer by 1. 
+ else { + url.username = base_url->username; + url.password = base_url->password; + url.host = base_url->host; + url.port = base_url->port; + state = ada::state::PATH; + goto goto_path; + } + + break; + } + case ada::state::SPECIAL_AUTHORITY_SLASHES: { + ada_log("SPECIAL_AUTHORITY_SLASHES ", helpers::substring(url_data, input_position)); + + // If c is U+002F (/) and remaining starts with U+002F (/), + // then set state to special authority ignore slashes state and increase pointer by 1. + state = ada::state::SPECIAL_AUTHORITY_IGNORE_SLASHES; + std::string_view view = helpers::substring(url_data, input_position); + if (ada::checkers::begins_with(view, "//")) { + input_position++; + } + // Otherwise, validation error, set state to special authority ignore slashes state and decrease pointer by 1. + else { + goto goto_special_authority_ignore_slashes; + } + + + break; /** Here we should just fall through !!! */ + } + case ada::state::SPECIAL_AUTHORITY_IGNORE_SLASHES: { + goto_special_authority_ignore_slashes: + ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ", helpers::substring(url_data, input_position)); + + // If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by 1. + while(true) { + if ((input_position == input_size) || ((url_data[input_position] != '/') && (url_data[input_position] != '\\'))) { + state = ada::state::AUTHORITY; + goto goto_authority; + } + input_position++; + } + + break; + } + case ada::state::QUERY: { + ada_log("QUERY ", helpers::substring(url_data, input_position)); + // If encoding is not UTF-8 and one of the following is true: + // - url is not special + // - url’s scheme is "ws" or "wss" + if (encoding != ada::encoding_type::UTF8) { + if (!url.is_special() || url.get_scheme_type() == ada::scheme::type::WS || url.get_scheme_type() == ada::scheme::type::WSS) { + // then set encoding to UTF-8. 
+              encoding = ada::encoding_type::UTF8;
+            }
+          }
+          // Let queryPercentEncodeSet be the special-query percent-encode set if url is special;
+          // otherwise the query percent-encode set.
+          auto query_percent_encode_set = url.is_special() ?
+                                ada::character_sets::SPECIAL_QUERY_PERCENT_ENCODE :
+                                ada::character_sets::QUERY_PERCENT_ENCODE;
+
+          // Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet,
+          // and append the result to url’s query.
+          url.query = ada::unicode::percent_encode(helpers::substring(url_data, input_position), query_percent_encode_set);
+
+          return url;
+        }
+        case ada::state::HOST: {
+          goto_host:
+          ada_log("HOST ", helpers::substring(url_data, input_position));
+          std::string_view host_view = helpers::substring(url_data, input_position);
+          bool inside_brackets{false};
+          size_t location = helpers::get_host_delimiter_location(url, host_view, inside_brackets);
+          input_position = (location != std::string_view::npos) ? input_position + location : input_size;
+          // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
+          if ((input_position != input_size) && (url_data[input_position] == ':') && !inside_brackets) {
+            // If buffer is the empty string, validation error, return failure.
+            // Let host be the result of host parsing buffer with url is not special.
+            ada_log("HOST parsing ", host_view);
+            if(!url.parse_host(host_view)) { return url; }
+            // Log the parsed host, falling back to "none" only when no host is set
+            // (value() must never be called on an empty optional).
+            ada_log("HOST parsing results in ", url.host.has_value() ? url.host.value() : "none");
+            // Set url’s host to host, buffer to the empty string, and state to port state.
+            state = ada::state::PORT;
+          }
+          // Otherwise, if one of the following is true:
+          // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
+          // - url is special and c is U+005C (\)
+          else if (input_position == input_size || url_data[input_position] == '/' || url_data[input_position] == '?'
|| (url.is_special() && url_data[input_position] == '\\')) { + + // If url is special and host_view is the empty string, validation error, return failure. + if (url.is_special() && host_view.empty()) { + url.is_valid = false; + return url; + } + + // Let host be the result of host parsing host_view with url is not special. + if (host_view.empty()) { + url.host = ""; + } else { + if(!url.parse_host(host_view)) { return url; } + } + // Set url’s host to host, and state to path start state. + state = ada::state::PATH_START; + goto goto_path_start; + } + break; + } + case ada::state::OPAQUE_PATH: { + ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position)); + std::string_view view = helpers::substring(url_data, input_position); + // If c is U+003F (?), then set url’s query to the empty string and state to query state. + size_t location = view.find('?'); + if(location != std::string_view::npos) { + view.remove_suffix(view.size() - location); + state = ada::state::QUERY; + input_position += location; + } else { + input_position = input_size; + } + url.has_opaque_path = true; + url.path = unicode::percent_encode(view, character_sets::C0_CONTROL_PERCENT_ENCODE); + break; + } + case ada::state::PORT: { + ada_log("PORT ", helpers::substring(url_data, input_position)); + std::string_view port_view = helpers::substring(url_data, input_position); + size_t consumed_bytes = url.parse_port(port_view, true); + input_position += consumed_bytes; + if(!url.is_valid) { return url; } + goto goto_path_start; + } + case ada::state::PATH_START: { + goto_path_start: + ada_log("PATH_START ", helpers::substring(url_data, input_position)); + // If url is special, then: + if (url.is_special()) { + + // Set state to path state. + state = ada::state::PATH; + + // Optimization: Avoiding going into PATH state improves the performance of urls ending with /. 
+ if (input_position == input_size) { + url.path = "/"; + return url; + } + // If c is neither U+002F (/) nor U+005C (\), then decrease pointer by 1. + // We know that (input_position == input_size) is impossible here, because of the previous if-check. + if ((url_data[input_position] != '/') && (url_data[input_position] != '\\')) { + goto goto_path; + } + + } + // Otherwise, if state override is not given and c is U+003F (?), + // set url’s query to the empty string and state to query state. + else if ((input_position < input_size) && (url_data[input_position] == '?')) { + state = ada::state::QUERY; + } + // Otherwise, if c is not the EOF code point: + else if (input_position != input_size) { + // Set state to path state. + state = ada::state::PATH; + + // If c is not U+002F (/), then decrease pointer by 1. + if (url_data[input_position] != '/') { + goto goto_path; + } + } + + break; + } + case ada::state::PATH: { + goto_path: + std::string_view view = helpers::substring(url_data, input_position); + ada_log("PATH ", helpers::substring(url_data, input_position)); + + // Most time, we do not need percent encoding. + // Furthermore, we can immediately locate the '?'. + size_t locofquestionmark = view.find('?'); + if(locofquestionmark != std::string_view::npos) { + state = ada::state::QUERY; + view.remove_suffix(view.size()-locofquestionmark); + input_position += locofquestionmark; + } else { + input_position = input_size; + } + if(!url.parse_prepared_path(view)) { return url; } + break; + } + case ada::state::FILE_SLASH: { + ada_log("FILE_SLASH ", helpers::substring(url_data, input_position)); + + // If c is U+002F (/) or U+005C (\), then: + if ((input_position != input_size) && (url_data[input_position] == '/' || url_data[input_position] == '\\')) { + ada_log("FILE_SLASH c is U+002F or U+005C"); + // Set state to file host state. 
+ state = ada::state::FILE_HOST; + } + // Otherwise: + else { + ada_log("FILE_SLASH otherwise"); + // If base is non-null and base’s scheme is "file", then: + // Note: it is unsafe to do base_url->scheme unless you know that + // base_url_has_value() is true. + if (base_url != nullptr && base_url->get_scheme_type() == ada::scheme::type::FILE) { + // Set url’s host to base’s host. + url.host = base_url->host; + + // If the code point substring from pointer to the end of input does not start with + // a Windows drive letter and base’s path[0] is a normalized Windows drive letter, + // then append base’s path[0] to url’s path. + if (!base_url->path.empty()) { + if (!checkers::is_windows_drive_letter(helpers::substring(url_data, input_position))) { + std::string_view first_base_url_path = base_url->path; + first_base_url_path.remove_prefix(1); + size_t loc = first_base_url_path.find('/'); + if(loc != std::string_view::npos) { + first_base_url_path.remove_suffix(first_base_url_path.size() - loc); + } + // Optimization opportunity: Get rid of initializing a std::string + if (checkers::is_normalized_windows_drive_letter(first_base_url_path)) { + url.path += '/'; + url.path += first_base_url_path; + } + } + } + } + + // Set state to path state, and decrease pointer by 1. + state = ada::state::PATH; + goto goto_path; + } + + break; + } + case ada::state::FILE_HOST: { + std::string_view view = helpers::substring(url_data, input_position); + ada_log("FILE_HOST ", helpers::substring(url_data, input_position)); + + size_t location = view.find_first_of("/\\?"); + std::string_view file_host_buffer(view.data(), (location != std::string_view::npos) ? location : view.size()); + if (checkers::is_windows_drive_letter(file_host_buffer)) { + state = ada::state::PATH; + goto goto_path; + } + else if (file_host_buffer.empty()) { + // Set url’s host to the empty string. + url.host = ""; + // Set state to path start state. 
+ state = ada::state::PATH_START; + goto goto_path_start; + } + else { + size_t consumed_bytes = file_host_buffer.size(); + input_position += consumed_bytes; + // Let host be the result of host parsing buffer with url is not special. + if(!url.parse_host(file_host_buffer)) { return url; } + + // If host is "localhost", then set host to the empty string. + if (url.host.has_value() && url.host.value() == "localhost") { + url.host = ""; + } + + // Set buffer to the empty string and state to path start state. + state = ada::state::PATH_START; + goto goto_path_start; + } + + break; + } + case ada::state::FILE: { + goto_file: + ada_log("FILE ", helpers::substring(url_data, input_position)); + std::string_view file_view = helpers::substring(url_data, input_position); + + // Set url’s scheme to "file". + url.set_scheme("file"); + + // Set url’s host to the empty string. + url.host = ""; + + // If c is U+002F (/) or U+005C (\), then: + if (input_position != input_size && (url_data[input_position] == '/' || url_data[input_position] == '\\')) { + ada_log("FILE c is U+002F or U+005C"); + // Set state to file slash state. + state = ada::state::FILE_SLASH; + } + // Otherwise, if base is non-null and base’s scheme is "file": + else if (base_url != nullptr && base_url->get_scheme_type() == ada::scheme::type::FILE) { + // Set url’s host to base’s host, url’s path to a clone of base’s path, and url’s query to base’s query. + ada_log("FILE base non-null"); + url.host = base_url->host; + url.path = base_url->path; + url.has_opaque_path = base_url->has_opaque_path; + url.query = base_url->query; + + // If c is U+003F (?), then set url’s query to the empty string and state to query state. + if (input_position != input_size && url_data[input_position] == '?') { + state = ada::state::QUERY; + } + // Otherwise, if c is not the EOF code point: + else if (input_position != input_size) { + // Set url’s query to null. 
+ url.query = std::nullopt; + + // If the code point substring from pointer to the end of input does not start with a + // Windows drive letter, then shorten url’s path. + if (!checkers::is_windows_drive_letter(file_view)) { + helpers::shorten_path(url); + } + // Otherwise: + else { + // Set url’s path to an empty list. + url.path.clear(); + url.has_opaque_path = true; + } + + // Set state to path state and decrease pointer by 1. + state = ada::state::PATH; + goto goto_path; + } + } + // Otherwise, set state to path state, and decrease pointer by 1. + else { + ada_log("FILE go to path"); + state = ada::state::PATH; + goto goto_path; + } + + break; + } + default: + ada::unreachable(); + } + } + ada_log("returning ", url.to_string()); + return url; + } + +} // namespace ada::parser +/* end file src/parser.cpp */ +/* end file src/ada.cpp */ diff --git a/deps/ada/ada.gyp b/deps/ada/ada.gyp new file mode 100644 index 00000000000000..dd2183239462c4 --- /dev/null +++ b/deps/ada/ada.gyp @@ -0,0 +1,29 @@ +{ + 'variables': { + 'v8_enable_i18n_support%': 1, + }, + 'targets': [ + { + 'target_name': 'ada', + 'type': 'static_library', + 'include_dirs': ['.'], + 'direct_dependent_settings': { + 'include_dirs': ['.'], + }, + 'sources': ['ada.cpp'], + 'conditions': [ + ['v8_enable_i18n_support==1', { + 'dependencies': [ + '<(icu_gyp_path):icui18n', + '<(icu_gyp_path):icuuc', + ], + }], + ['OS=="win" and v8_enable_i18n_support==1', { + 'dependencies': [ + '<(icu_gyp_path):icudata', + ], + }], + ] + }, + ] +} diff --git a/deps/ada/ada.h b/deps/ada/ada.h new file mode 100644 index 00000000000000..fdb3f383334437 --- /dev/null +++ b/deps/ada/ada.h @@ -0,0 +1,1846 @@ +/* auto-generated on 2023-01-30 11:28:20 -0500. Do not edit! 
*/ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada.h +/* begin file include/ada.h */ +#ifndef ADA_H +#define ADA_H + +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/character_sets.h +/* begin file include/ada/character_sets.h */ +/** + * @file character_sets.h + * @brief Definitions of the character sets used by unicode functions. + * @author Node.js + * @see https://github.com/nodejs/node/blob/main/src/node_url_tables.cc + */ +#ifndef ADA_CHARACTER_SETS_H +#define ADA_CHARACTER_SETS_H + +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/common_defs.h +/* begin file include/ada/common_defs.h */ +/** + * @file common_defs.h + * @brief Common definitions for cross-platform compiler support. + */ +#ifndef ADA_COMMON_DEFS_H +#define ADA_COMMON_DEFS_H +#ifdef _MSC_VER +#define ADA_VISUAL_STUDIO 1 +/** + * We want to differentiate carefully between + * clang under visual studio and regular visual + * studio. + */ +#ifdef __clang__ +// clang under visual studio +#define ADA_CLANG_VISUAL_STUDIO 1 +#else +// just regular visual studio (best guess) +#define ADA_REGULAR_VISUAL_STUDIO 1 +#endif // __clang__ +#endif // _MSC_VER + + +#if defined(__GNUC__) + // Marks a block with a name so that MCA analysis can see it. 
+  #define ADA_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
+  #define ADA_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
+  // Wraps `block` between the two markers above. The markers must be spelled with
+  // their ADA_ prefix: no unprefixed BEGIN_DEBUG_BLOCK/END_DEBUG_BLOCK macro is defined here.
+  #define ADA_DEBUG_BLOCK(name, block) ADA_BEGIN_DEBUG_BLOCK(name); block; ADA_END_DEBUG_BLOCK(name);
+#else
+  #define ADA_BEGIN_DEBUG_BLOCK(name)
+  #define ADA_END_DEBUG_BLOCK(name)
+  // Without the markers the wrapped code must still be emitted, exactly as in the
+  // __GNUC__ branch; only the annotations disappear.
+  #define ADA_DEBUG_BLOCK(name, block) block;
+#endif
+
+// Align to N-byte boundary
+#define ADA_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1))
+#define ADA_ROUNDDOWN_N(a, n) ((a) & ~((n)-1))
+
+#define ADA_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0)
+
+#if defined(ADA_REGULAR_VISUAL_STUDIO)
+
+  #define ada_really_inline __forceinline
+  #define ada_never_inline __declspec(noinline)
+
+  #define ada_unused
+  #define ada_warn_unused
+
+  #ifndef ada_likely
+  #define ada_likely(x) x
+  #endif
+  #ifndef ada_unlikely
+  #define ada_unlikely(x) x
+  #endif
+
+  #define ADA_PUSH_DISABLE_WARNINGS __pragma(warning( push ))
+  #define ADA_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 ))
+  #define ADA_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER ))
+  // Get rid of Intellisense-only warnings (Code Analysis)
+  // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910).
+ #ifdef __has_include + #if __has_include() + #include + #define ADA_DISABLE_UNDESIRED_WARNINGS ADA_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS) + #endif + #endif + + #ifndef ADA_DISABLE_UNDESIRED_WARNINGS + #define ADA_DISABLE_UNDESIRED_WARNINGS + #endif + + #define ADA_DISABLE_DEPRECATED_WARNING ADA_DISABLE_VS_WARNING(4996) + #define ADA_DISABLE_STRICT_OVERFLOW_WARNING + #define ADA_POP_DISABLE_WARNINGS __pragma(warning( pop )) + +#else // ADA_REGULAR_VISUAL_STUDIO + + #define ada_really_inline inline __attribute__((always_inline)) + #define ada_never_inline inline __attribute__((noinline)) + + #define ada_unused __attribute__((unused)) + #define ada_warn_unused __attribute__((warn_unused_result)) + + #ifndef ada_likely + #define ada_likely(x) __builtin_expect(!!(x), 1) + #endif + #ifndef ada_unlikely + #define ada_unlikely(x) __builtin_expect(!!(x), 0) + #endif + + #define ADA_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push") + // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary + #define ADA_PUSH_DISABLE_ALL_WARNINGS ADA_PUSH_DISABLE_WARNINGS \ + ADA_DISABLE_GCC_WARNING(-Weffc++) \ + ADA_DISABLE_GCC_WARNING(-Wall) \ + ADA_DISABLE_GCC_WARNING(-Wconversion) \ + ADA_DISABLE_GCC_WARNING(-Wextra) \ + ADA_DISABLE_GCC_WARNING(-Wattributes) \ + ADA_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ + ADA_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ + ADA_DISABLE_GCC_WARNING(-Wreturn-type) \ + ADA_DISABLE_GCC_WARNING(-Wshadow) \ + ADA_DISABLE_GCC_WARNING(-Wunused-parameter) \ + ADA_DISABLE_GCC_WARNING(-Wunused-variable) + #define ADA_PRAGMA(P) _Pragma(#P) + #define ADA_DISABLE_GCC_WARNING(WARNING) ADA_PRAGMA(GCC diagnostic ignored #WARNING) + #if defined(ADA_CLANG_VISUAL_STUDIO) + #define ADA_DISABLE_UNDESIRED_WARNINGS ADA_DISABLE_GCC_WARNING(-Wmicrosoft-include) + #else + #define ADA_DISABLE_UNDESIRED_WARNINGS + #endif + #define ADA_DISABLE_DEPRECATED_WARNING ADA_DISABLE_GCC_WARNING(-Wdeprecated-declarations) + #define 
ADA_DISABLE_STRICT_OVERFLOW_WARNING ADA_DISABLE_GCC_WARNING(-Wstrict-overflow) + #define ADA_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop") + +#endif // MSC_VER + +#if defined(ADA_VISUAL_STUDIO) + /** + * It does not matter here whether you are using + * the regular visual studio or clang under visual + * studio. + */ + #if ADA_USING_LIBRARY + #define ADA_DLLIMPORTEXPORT __declspec(dllimport) + #else + #define ADA_DLLIMPORTEXPORT __declspec(dllexport) + #endif +#else + #define ADA_DLLIMPORTEXPORT +#endif + +/// If EXPR is an error, returns it. +#define ADA_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } + +// __has_cpp_attribute is part of C++20 +#if !defined(__has_cpp_attribute) +#define __has_cpp_attribute(x) 0 +#endif + + +#if __has_cpp_attribute(gnu::noinline) +#define ADA_ATTRIBUTE_NOINLINE [[gnu::noinline]] +#else +#define ADA_ATTRIBUTE_NOINLINE +#endif + +namespace ada { + [[noreturn]] inline void unreachable() { +#ifdef __GNUC__ + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(false); +#else +#endif + } +} + + + +#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ <= 8 +#define ADA_OLD_GCC 1 +#endif // __GNUC__ <= 8 +#endif // defined(__GNUC__) && !defined(__clang__) + +#if ADA_OLD_GCC +#define ada_constexpr +#else +#define ada_constexpr constexpr +#endif + +#endif // ADA_COMMON_DEFS_H + + #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) + #define ADA_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #elif defined(_WIN32) + #define ADA_IS_BIG_ENDIAN 0 + #else + #if defined(__APPLE__) || defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ + #include + #elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__) + #include + #else // defined(__APPLE__) || defined(__FreeBSD__) + + #ifdef __has_include + #if __has_include() + #include + #endif //__has_include() + #endif //__has_include + + #endif // defined(__APPLE__) || defined(__FreeBSD__) + + + #ifndef 
!defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) + #define ADA_IS_BIG_ENDIAN 0 + #endif + + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define ADA_IS_BIG_ENDIAN 0 + #else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define ADA_IS_BIG_ENDIAN 1 + #endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + + #endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ +/* end file include/ada/common_defs.h */ +#include + +namespace ada::character_sets { + + constexpr char hex[1024] = + "%00\0%01\0%02\0%03\0%04\0%05\0%06\0%07\0" + "%08\0%09\0%0A\0%0B\0%0C\0%0D\0%0E\0%0F\0" + "%10\0%11\0%12\0%13\0%14\0%15\0%16\0%17\0" + "%18\0%19\0%1A\0%1B\0%1C\0%1D\0%1E\0%1F\0" + "%20\0%21\0%22\0%23\0%24\0%25\0%26\0%27\0" + "%28\0%29\0%2A\0%2B\0%2C\0%2D\0%2E\0%2F\0" + "%30\0%31\0%32\0%33\0%34\0%35\0%36\0%37\0" + "%38\0%39\0%3A\0%3B\0%3C\0%3D\0%3E\0%3F\0" + "%40\0%41\0%42\0%43\0%44\0%45\0%46\0%47\0" + "%48\0%49\0%4A\0%4B\0%4C\0%4D\0%4E\0%4F\0" + "%50\0%51\0%52\0%53\0%54\0%55\0%56\0%57\0" + "%58\0%59\0%5A\0%5B\0%5C\0%5D\0%5E\0%5F\0" + "%60\0%61\0%62\0%63\0%64\0%65\0%66\0%67\0" + "%68\0%69\0%6A\0%6B\0%6C\0%6D\0%6E\0%6F\0" + "%70\0%71\0%72\0%73\0%74\0%75\0%76\0%77\0" + "%78\0%79\0%7A\0%7B\0%7C\0%7D\0%7E\0%7F\0" + "%80\0%81\0%82\0%83\0%84\0%85\0%86\0%87\0" + "%88\0%89\0%8A\0%8B\0%8C\0%8D\0%8E\0%8F\0" + "%90\0%91\0%92\0%93\0%94\0%95\0%96\0%97\0" + "%98\0%99\0%9A\0%9B\0%9C\0%9D\0%9E\0%9F\0" + "%A0\0%A1\0%A2\0%A3\0%A4\0%A5\0%A6\0%A7\0" + "%A8\0%A9\0%AA\0%AB\0%AC\0%AD\0%AE\0%AF\0" + "%B0\0%B1\0%B2\0%B3\0%B4\0%B5\0%B6\0%B7\0" + "%B8\0%B9\0%BA\0%BB\0%BC\0%BD\0%BE\0%BF\0" + "%C0\0%C1\0%C2\0%C3\0%C4\0%C5\0%C6\0%C7\0" + "%C8\0%C9\0%CA\0%CB\0%CC\0%CD\0%CE\0%CF\0" + "%D0\0%D1\0%D2\0%D3\0%D4\0%D5\0%D6\0%D7\0" + "%D8\0%D9\0%DA\0%DB\0%DC\0%DD\0%DE\0%DF\0" + "%E0\0%E1\0%E2\0%E3\0%E4\0%E5\0%E6\0%E7\0" + "%E8\0%E9\0%EA\0%EB\0%EC\0%ED\0%EE\0%EF\0" + "%F0\0%F1\0%F2\0%F3\0%F4\0%F5\0%F6\0%F7\0" + "%F8\0%F9\0%FA\0%FB\0%FC\0%FD\0%FE\0%FF"; + + constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32] = { 
+ // 00 01 02 03 04 05 06 07 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 08 09 0A 0B 0C 0D 0E 0F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 10 11 12 13 14 15 16 17 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 18 19 1A 1B 1C 1D 1E 1F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 20 21 22 23 24 25 26 27 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 28 29 2A 2B 2C 2D 2E 2F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 30 31 32 33 34 35 36 37 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 38 39 3A 3B 3C 3D 3E 3F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 40 41 42 43 44 45 46 47 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 48 49 4A 4B 4C 4D 4E 4F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 50 51 52 53 54 55 56 57 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 58 59 5A 5B 5C 5D 5E 5F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 60 61 62 63 64 65 66 67 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 68 69 6A 6B 6C 6D 6E 6F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 70 71 72 73 74 75 76 77 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 78 79 7A 7B 7C 7D 7E 7F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, + // 80 81 82 83 84 85 86 87 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 88 89 8A 8B 8C 8D 8E 8F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 90 91 92 93 94 95 96 97 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 98 99 9A 9B 9C 9D 9E 9F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A0 A1 A2 A3 A4 A5 A6 A7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A8 A9 AA AB AC AD AE AF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B0 B1 B2 B3 B4 B5 B6 B7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B8 B9 BA BB BC 
BD BE BF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C0 C1 C2 C3 C4 C5 C6 C7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C8 C9 CA CB CC CD CE CF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D0 D1 D2 D3 D4 D5 D6 D7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D8 D9 DA DB DC DD DE DF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E0 E1 E2 E3 E4 E5 E6 E7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E8 E9 EA EB EC ED EE EF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F0 F1 F2 F3 F4 F5 F6 F7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F8 F9 FA FB FC FD FE FF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 + }; + + constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32] = { + // 00 01 02 03 04 05 06 07 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 08 09 0A 0B 0C 0D 0E 0F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 10 11 12 13 14 15 16 17 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 18 19 1A 1B 1C 1D 1E 1F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 20 21 22 23 24 25 26 27 + 0x01 | 0x00 | 0x04 | 0x08 | 0x00 | 0x00 | 0x00 | 0x80, + // 28 29 2A 2B 2C 2D 2E 2F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 30 31 32 33 34 35 36 37 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 38 39 3A 3B 3C 3D 3E 3F + 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x00, + // 40 41 42 43 44 45 46 47 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 48 49 4A 4B 4C 4D 4E 4F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 50 51 52 53 54 55 56 57 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 58 59 5A 5B 5C 5D 5E 5F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 60 61 62 63 64 65 66 67 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 68 69 6A 6B 6C 6D 6E 6F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 
| 0x00 | 0x00 | 0x00, + // 70 71 72 73 74 75 76 77 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 78 79 7A 7B 7C 7D 7E 7F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, + // 80 81 82 83 84 85 86 87 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 88 89 8A 8B 8C 8D 8E 8F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 90 91 92 93 94 95 96 97 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 98 99 9A 9B 9C 9D 9E 9F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A0 A1 A2 A3 A4 A5 A6 A7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A8 A9 AA AB AC AD AE AF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B0 B1 B2 B3 B4 B5 B6 B7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B8 B9 BA BB BC BD BE BF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C0 C1 C2 C3 C4 C5 C6 C7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C8 C9 CA CB CC CD CE CF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D0 D1 D2 D3 D4 D5 D6 D7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D8 D9 DA DB DC DD DE DF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E0 E1 E2 E3 E4 E5 E6 E7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E8 E9 EA EB EC ED EE EF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F0 F1 F2 F3 F4 F5 F6 F7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F8 F9 FA FB FC FD FE FF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 + }; + + constexpr uint8_t QUERY_PERCENT_ENCODE[32] = { + // 00 01 02 03 04 05 06 07 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 08 09 0A 0B 0C 0D 0E 0F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 10 11 12 13 14 15 16 17 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 18 19 1A 1B 1C 1D 1E 1F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 20 21 22 23 24 25 26 27 + 
0x01 | 0x00 | 0x04 | 0x08 | 0x00 | 0x00 | 0x00 | 0x00, + // 28 29 2A 2B 2C 2D 2E 2F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 30 31 32 33 34 35 36 37 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 38 39 3A 3B 3C 3D 3E 3F + 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x00, + // 40 41 42 43 44 45 46 47 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 48 49 4A 4B 4C 4D 4E 4F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 50 51 52 53 54 55 56 57 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 58 59 5A 5B 5C 5D 5E 5F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 60 61 62 63 64 65 66 67 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 68 69 6A 6B 6C 6D 6E 6F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 70 71 72 73 74 75 76 77 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 78 79 7A 7B 7C 7D 7E 7F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, + // 80 81 82 83 84 85 86 87 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 88 89 8A 8B 8C 8D 8E 8F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 90 91 92 93 94 95 96 97 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 98 99 9A 9B 9C 9D 9E 9F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A0 A1 A2 A3 A4 A5 A6 A7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A8 A9 AA AB AC AD AE AF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B0 B1 B2 B3 B4 B5 B6 B7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B8 B9 BA BB BC BD BE BF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C0 C1 C2 C3 C4 C5 C6 C7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C8 C9 CA CB CC CD CE CF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D0 D1 D2 D3 D4 D5 D6 D7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D8 D9 DA DB DC DD DE DF + 0x01 | 0x02 | 0x04 | 
0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E0 E1 E2 E3 E4 E5 E6 E7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E8 E9 EA EB EC ED EE EF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F0 F1 F2 F3 F4 F5 F6 F7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F8 F9 FA FB FC FD FE FF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 + }; + + constexpr uint8_t FRAGMENT_PERCENT_ENCODE[32] = { + // 00 01 02 03 04 05 06 07 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 08 09 0A 0B 0C 0D 0E 0F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 10 11 12 13 14 15 16 17 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 18 19 1A 1B 1C 1D 1E 1F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 20 21 22 23 24 25 26 27 + 0x01 | 0x00 | 0x04 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 28 29 2A 2B 2C 2D 2E 2F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 30 31 32 33 34 35 36 37 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 38 39 3A 3B 3C 3D 3E 3F + 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x00, + // 40 41 42 43 44 45 46 47 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 48 49 4A 4B 4C 4D 4E 4F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 50 51 52 53 54 55 56 57 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 58 59 5A 5B 5C 5D 5E 5F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 60 61 62 63 64 65 66 67 + 0x01 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 68 69 6A 6B 6C 6D 6E 6F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 70 71 72 73 74 75 76 77 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 78 79 7A 7B 7C 7D 7E 7F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, + // 80 81 82 83 84 85 86 87 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 88 89 8A 8B 8C 8D 8E 8F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 90 91 92 
93 94 95 96 97 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 98 99 9A 9B 9C 9D 9E 9F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A0 A1 A2 A3 A4 A5 A6 A7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A8 A9 AA AB AC AD AE AF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B0 B1 B2 B3 B4 B5 B6 B7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B8 B9 BA BB BC BD BE BF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C0 C1 C2 C3 C4 C5 C6 C7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C8 C9 CA CB CC CD CE CF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D0 D1 D2 D3 D4 D5 D6 D7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D8 D9 DA DB DC DD DE DF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E0 E1 E2 E3 E4 E5 E6 E7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E8 E9 EA EB EC ED EE EF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F0 F1 F2 F3 F4 F5 F6 F7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F8 F9 FA FB FC FD FE FF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 + }; + + constexpr uint8_t USERINFO_PERCENT_ENCODE[32] = { + // 00 01 02 03 04 05 06 07 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 08 09 0A 0B 0C 0D 0E 0F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 10 11 12 13 14 15 16 17 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 18 19 1A 1B 1C 1D 1E 1F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 20 21 22 23 24 25 26 27 + 0x01 | 0x00 | 0x04 | 0x08 | 0x00 | 0x00 | 0x00 | 0x00, + // 28 29 2A 2B 2C 2D 2E 2F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, + // 30 31 32 33 34 35 36 37 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 38 39 3A 3B 3C 3D 3E 3F + 0x00 | 0x00 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 40 41 42 43 44 45 46 47 + 0x01 | 0x00 | 0x00 | 0x00 | 0x00 
| 0x00 | 0x00 | 0x00, + // 48 49 4A 4B 4C 4D 4E 4F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 50 51 52 53 54 55 56 57 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 58 59 5A 5B 5C 5D 5E 5F + 0x00 | 0x00 | 0x00 | 0x08 | 0x10 | 0x20 | 0x40 | 0x00, + // 60 61 62 63 64 65 66 67 + 0x01 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 68 69 6A 6B 6C 6D 6E 6F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 70 71 72 73 74 75 76 77 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 78 79 7A 7B 7C 7D 7E 7F + 0x00 | 0x00 | 0x00 | 0x08 | 0x10 | 0x20 | 0x00 | 0x80, + // 80 81 82 83 84 85 86 87 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 88 89 8A 8B 8C 8D 8E 8F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 90 91 92 93 94 95 96 97 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 98 99 9A 9B 9C 9D 9E 9F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A0 A1 A2 A3 A4 A5 A6 A7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A8 A9 AA AB AC AD AE AF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B0 B1 B2 B3 B4 B5 B6 B7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B8 B9 BA BB BC BD BE BF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C0 C1 C2 C3 C4 C5 C6 C7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C8 C9 CA CB CC CD CE CF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D0 D1 D2 D3 D4 D5 D6 D7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D8 D9 DA DB DC DD DE DF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E0 E1 E2 E3 E4 E5 E6 E7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E8 E9 EA EB EC ED EE EF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F0 F1 F2 F3 F4 F5 F6 F7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F8 F9 FA FB FC FD FE FF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 
+ }; + + constexpr uint8_t PATH_PERCENT_ENCODE[32] = { + // 00 01 02 03 04 05 06 07 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 08 09 0A 0B 0C 0D 0E 0F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 10 11 12 13 14 15 16 17 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 18 19 1A 1B 1C 1D 1E 1F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 20 21 22 23 24 25 26 27 + 0x01 | 0x00 | 0x04 | 0x08 | 0x00 | 0x00 | 0x00 | 0x00, + // 28 29 2A 2B 2C 2D 2E 2F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 30 31 32 33 34 35 36 37 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 38 39 3A 3B 3C 3D 3E 3F + 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x80, + // 40 41 42 43 44 45 46 47 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 48 49 4A 4B 4C 4D 4E 4F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 50 51 52 53 54 55 56 57 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 58 59 5A 5B 5C 5D 5E 5F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 60 61 62 63 64 65 66 67 + 0x01 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 68 69 6A 6B 6C 6D 6E 6F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 70 71 72 73 74 75 76 77 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 78 79 7A 7B 7C 7D 7E 7F + 0x00 | 0x00 | 0x00 | 0x08 | 0x00 | 0x20 | 0x00 | 0x80, + // 80 81 82 83 84 85 86 87 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 88 89 8A 8B 8C 8D 8E 8F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 90 91 92 93 94 95 96 97 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 98 99 9A 9B 9C 9D 9E 9F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A0 A1 A2 A3 A4 A5 A6 A7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A8 A9 AA AB AC AD AE AF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B0 B1 B2 B3 B4 B5 B6 B7 + 0x01 | 0x02 | 0x04 | 
0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B8 B9 BA BB BC BD BE BF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C0 C1 C2 C3 C4 C5 C6 C7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C8 C9 CA CB CC CD CE CF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D0 D1 D2 D3 D4 D5 D6 D7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D8 D9 DA DB DC DD DE DF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E0 E1 E2 E3 E4 E5 E6 E7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E8 E9 EA EB EC ED EE EF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F0 F1 F2 F3 F4 F5 F6 F7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F8 F9 FA FB FC FD FE FF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 + }; + + ada_really_inline bool bit_at(const uint8_t a[], const uint8_t i) { + return !!(a[i >> 3] & (1 << (i & 7))); + } + +} // namespace ada::character_sets + +#endif // ADA_CHARACTER_SETS_H +/* end file include/ada/character_sets.h */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/checkers.h +/* begin file include/ada/checkers.h */ +/** + * @file checkers.h + * @brief Definitions for URL specific checkers used within Ada. + */ +#ifndef ADA_CHECKERS_H +#define ADA_CHECKERS_H + + +#include +#include + +namespace ada::checkers { + + // + // More likely to be inlined by the compiler and constexpr. + /** + * Assuming that x is an ASCII letter, this function returns the lower case equivalent. + * @details More likely to be inlined by the compiler and constexpr. + */ + constexpr char to_lower(char x) noexcept { return (x | 0x20); } + + /** + * Returns true if the character is an ASCII letter. Equivalent to std::isalpha but + * more likely to be inlined by the compiler. + * + * @attention std::isalpha is not constexpr generally. 
+ */ + constexpr bool is_alpha(char x) noexcept { return (to_lower(x) >= 'a') && (to_lower(x) <= 'z'); } + + /** + * Check whether a string starts with 0x or 0X. The function is only + * safe if input.size() >=2. + * + * @see has_hex_prefix + */ + inline bool has_hex_prefix_unsafe(std::string_view input) { + // This is actualy efficient code, see has_hex_prefix for the assembly. + uint32_t value_one = 1; + bool is_little_endian = (reinterpret_cast(&value_one)[0] == 1); + uint16_t word0x{}; + std::memcpy(&word0x, "0x", 2); // we would use bit_cast in C++20 and the function could be constexpr. + uint16_t two_first_bytes{}; + std::memcpy(&two_first_bytes, input.data(),2); + if(is_little_endian) { two_first_bytes |= 0x2000; } else { two_first_bytes |= 0x020; } + return two_first_bytes == word0x; + } + + /** + * Check whether a string starts with 0x or 0X. + */ + inline bool has_hex_prefix(std::string_view input) { + return input.size() >=2 && has_hex_prefix_unsafe(input); + } + + /** + * Check whether x is an ASCII digit. More likely to be inlined than std::isdigit. + */ + constexpr bool is_digit(char x) noexcept { return (x >= '0') & (x <= '9'); } + + /** + * @details A string starts with a Windows drive letter if all of the following are true: + * + * - its length is greater than or equal to 2 + * - its first two code points are a Windows drive letter + * - its length is 2 or its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#). + * + * https://url.spec.whatwg.org/#start-with-a-windows-drive-letter + */ + inline constexpr bool is_windows_drive_letter(std::string_view input) noexcept { + return input.size() >= 2 && (is_alpha(input[0]) && ((input[1] == ':') || (input[1] == '|'))) + && ((input.size() == 2) || (input[2] == '/' || input[2] == '\\' || input[2] == '?' || input[2] == '#')); + } + + /** + * @details A normalized Windows drive letter is a Windows drive letter of which the second code point is U+003A (:). 
+   * Only the first two characters are inspected: an ASCII letter followed by ':'.
+   */
+  inline constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept {
+    return input.size() >= 2 && (is_alpha(input[0]) && (input[1] == ':'));
+  }
+
+  /**
+   * Returns true if view starts with prefix. An empty prefix always matches.
+   * @warning Will be removed when Ada supports C++20.
+   */
+  ada_really_inline constexpr bool begins_with(std::string_view view, std::string_view prefix) {
+    // in C++20, you have view.starts_with(prefix)
+    return view.size() >= prefix.size() && (view.substr(0, prefix.size()) == prefix);
+  }
+
+  /**
+   * Returns true if an input is an ipv4 address.
+   */
+  ada_really_inline ada_constexpr bool is_ipv4(std::string_view view) noexcept;
+
+  /**
+   * Returns a bitset. If the first bit is set, then at least one character needs
+   * percent encoding. If the second bit is set, a \\ is found. If the third bit is set
+   * then we have a dot. If the fourth bit is set, then we have a percent character.
+   */
+  ada_really_inline constexpr uint8_t path_signature(std::string_view input) noexcept;
+
+
+  /**
+   * Returns true if the length of the domain name and its labels are according to the specifications.
+   * The length of the domain must be 255 octets (253 characters not including the last 2 which are the empty
+   * label reserved at the end). When the empty label is included (a dot at the end), the domain name can have
+   * 254 characters. The length of a label must be at least 1 and at most 63 characters.
+   * @see section 3.1. of https://www.rfc-editor.org/rfc/rfc1034
+   * @see https://www.unicode.org/reports/tr46/#ToASCII
+   */
+  ada_really_inline constexpr bool verify_dns_length(std::string_view input) noexcept;
+
+} // namespace ada::checkers
+
+#endif //ADA_CHECKERS_H
+/* end file include/ada/checkers.h */
+// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/log.h
+/* begin file include/ada/log.h */
+
+/**
+ * @file log.h
+ * @private
+ * @brief logging code.
+ */ +#ifndef ADA_LOG_H +#define ADA_LOG_H + +#include +// To enable logging, set ADA_LOGGING to 1: +#ifndef ADA_LOGGING +#define ADA_LOGGING 0 +#endif + +namespace ada { + +/** + * Private function used for logging messages. + * @private + */ +template +ada_really_inline void inner_log([[maybe_unused]] T t) { +#if ADA_LOGGING + std::cout << t << std::endl; +#endif +} + + +/** + * Private function used for logging messages. + * @private + */ +template +ada_really_inline void inner_log([[maybe_unused]] T t, [[maybe_unused]] Args... args) { +#if ADA_LOGGING + std::cout << t; + inner_log(args...) ; +#endif +} + + +/** + * Log a message. + * @private + */ +template +ada_really_inline void log([[maybe_unused]] T t, [[maybe_unused]] Args... args) { +#if ADA_LOGGING + std::cout << "ADA_LOG: " << t; + inner_log(args...) ; +#endif +} + +/** + * Log a message. + * @private + */ +template +ada_really_inline void log([[maybe_unused]] T t) { +#if ADA_LOGGING + std::cout << "ADA_LOG: " << t << std::endl; +#endif + +} +} + +#if ADA_LOGGING + +#ifndef ada_log +#define ada_log(...) do { \ + ada::log(__VA_ARGS__); \ +} while(0) +#endif // ada_log +#else +#define ada_log(...) +#endif // ADA_LOGGING + +#endif // ADA_LOG_H +/* end file include/ada/log.h */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/encoding_type.h +/* begin file include/ada/encoding_type.h */ +/** + * @file encoding_type.h + * @brief Definition for supported encoding types. + */ +#ifndef ADA_ENCODING_TYPE_H +#define ADA_ENCODING_TYPE_H + +#include + +namespace ada { + + /** + * This specification defines three encodings with the same names as encoding schemes defined + * in the Unicode standard: UTF-8, UTF-16LE, and UTF-16BE. + * + * @see https://encoding.spec.whatwg.org/#encodings + */ + enum class encoding_type { + UTF8, + UTF_16LE, + UTF_16BE, + }; + + /** + * Convert a encoding_type to string. 
+ */ + ada_warn_unused std::string to_string(encoding_type type); + +} // ada namespace + +#endif // ADA_ENCODING_TYPE_H +/* end file include/ada/encoding_type.h */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/helpers.h +/* begin file include/ada/helpers.h */ +/** + * @file helpers.h + * @brief Definitions for helper functions used within Ada. + */ +#ifndef ADA_HELPERS_H +#define ADA_HELPERS_H + +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/url.h +/* begin file include/ada/url.h */ +/** + * @file url.h + * @brief Definitions for the URL + */ +#ifndef ADA_URL_H +#define ADA_URL_H + +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/scheme.h +/* begin file include/ada/scheme.h */ +/** + * @file scheme.h + * @brief Definitions for the URL scheme. + */ +#ifndef ADA_SCHEME_H +#define ADA_SCHEME_H + + +#include +#include +#include + +namespace ada::scheme { + /** + * Type of the scheme as an enum. + * Using strings to represent a scheme type is not ideal because + * checking for types involves string comparisons. It is faster to use + * a simple integer. + */ + enum type { + HTTP = 0, + NOT_SPECIAL = 1, + HTTPS = 2, + WS = 3, + FTP = 4, + WSS = 5, + FILE = 6 + }; + + namespace details { + // for use with is_special and get_special_port + // Spaces, if present, are removed from URL. + constexpr std::string_view is_special_list[] = {"http", " ", "https", + "ws", "ftp", "wss", "file", " "}; + // for use with get_special_port + constexpr uint16_t special_ports[] = {80, 0, 443, 80, 21, 443, 0, 0}; + } + + /** + * A special scheme is an ASCII string that is listed in the first column of the following table. + * The default port for a special scheme is listed in the second column on the same row. + * The default port for any other ASCII string is null. 
+   *
+   * @see https://url.spec.whatwg.org/#url-miscellaneous
+   * @param scheme the scheme to test; compared exactly as given, without case folding
+   * @return If scheme is a special scheme
+   */
+  ada_really_inline constexpr bool is_special(std::string_view scheme) {
+    if(scheme.empty()) { return false; }
+    // Slot selection: (2 * length + first character) & 7 picks one of the
+    // eight entries of is_special_list, which is then compared in full.
+    int hash_value = (2*scheme.size() + (unsigned)(scheme[0])) & 7;
+    const std::string_view target = details::is_special_list[hash_value];
+    // No entry of the list is empty, so target[0] is always valid.
+    return (target[0] == scheme[0]) && (target.substr(1) == scheme.substr(1));
+  }
+
+  /**
+   * A special scheme is an ASCII string that is listed in the first column of the following table.
+   * The default port for a special scheme is listed in the second column on the same row.
+   * The default port for any other ASCII string is null.
+   *
+   * @see https://url.spec.whatwg.org/#url-miscellaneous
+   * @param scheme the scheme to look up; compared exactly as given
+   * @return The special port, or 0 when the scheme is empty or not in the list
+   */
+  constexpr uint16_t get_special_port(std::string_view scheme) noexcept {
+    if(scheme.empty()) { return 0; }
+    // Same slot computation as is_special.
+    int hash_value = (2*scheme.size() + (unsigned)(scheme[0])) & 7;
+    const std::string_view target = details::is_special_list[hash_value];
+    if ((target[0] == scheme[0]) && (target.substr(1) == scheme.substr(1))) {
+      return details::special_ports[hash_value];
+    } else { return 0; }
+  }
+
+  /**
+   * Returns the port number of a special scheme.
+   * The enumerator value is used directly as the index into special_ports.
+   * @see https://url.spec.whatwg.org/#special-scheme
+   */
+  constexpr uint16_t get_special_port(ada::scheme::type type) noexcept {
+    return details::special_ports[int(type)];
+  }
+
+  /**
+   * Returns the scheme of an input, or NOT_SPECIAL if it's not a special scheme defined by the spec.
+ */ + constexpr ada::scheme::type get_scheme_type(std::string_view scheme) noexcept { + if(scheme.empty()) { return ada::scheme::NOT_SPECIAL; } + int hash_value = (2*scheme.size() + (unsigned)(scheme[0])) & 7; + const std::string_view target = details::is_special_list[hash_value]; + if ((target[0] == scheme[0]) && (target.substr(1) == scheme.substr(1))) { + return ada::scheme::type(hash_value); + } else { return ada::scheme::NOT_SPECIAL; } + } + +} // namespace ada::serializers + +#endif // ADA_SCHEME_H +/* end file include/ada/scheme.h */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/serializers.h +/* begin file include/ada/serializers.h */ +/** + * @file serializers.h + * @brief Definitions for the URL serializers. + */ +#ifndef ADA_SERIALIZERS_H +#define ADA_SERIALIZERS_H + + +#include +#include +#include + +namespace ada::serializers { + + /** + * Finds and returns the longest sequence of 0 values in a ipv6 input. + * + * @returns -1 if not found. + */ + size_t find_longest_sequence_of_ipv6_pieces(const std::array& address) noexcept; + + /** + * Serializes an ipv6 address. + * @details An IPv6 address is a 128-bit unsigned integer that identifies a network address. + * @see https://url.spec.whatwg.org/#concept-ipv6-serializer + */ + std::string ipv6(const std::array& address) noexcept; + + /** + * Serializes an ipv4 address. + * @details An IPv4 address is a 32-bit unsigned integer that identifies a network address. + * @see https://url.spec.whatwg.org/#concept-ipv4-serializer + */ + std::string ipv4(const uint64_t address) noexcept; + +} // namespace ada::serializers + +#endif // ADA_SERIALIZERS_H +/* end file include/ada/serializers.h */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/unicode.h +/* begin file include/ada/unicode.h */ +/** + * @file unicode.h + * @brief Definitions for all unicode specific functions. 
+ */ +#ifndef ADA_UNICODE_H +#define ADA_UNICODE_H + +#include +#include + +namespace ada::unicode { + + /** + * We receive a UTF-8 string representing a domain name. + * If the string is percent encoded, we apply percent decoding. + * + * Given a domain, we need to identify its labels. + * They are separated by label-separators: + * + * U+002E ( . ) FULL STOP + * U+FF0E ( . ) FULLWIDTH FULL STOP + * U+3002 ( 。 ) IDEOGRAPHIC FULL STOP + * U+FF61 ( 。 ) HALFWIDTH IDEOGRAPHIC FULL STOP + * + * They are all mapped to U+002E. + * + * We process each label into a string that should not exceed 63 octets. + * If the string is already punycode (starts with "xn--"), then we must + * scan it to look for unallowed code points. + * Otherwise, if the string is not pure ASCII, we need to transcode it + * to punycode by following RFC 3454 which requires us to + * - Map characters (see section 3), + * - Normalize (see section 4), + * - Reject forbidden characters, + * - Check for right-to-left characters and if so, check all requirements (see section 6), + * - Optionally reject based on unassigned code points (section 7). + * + * The Unicode standard provides a table of code points with a mapping, a list of + * forbidden code points and so forth. This table is subject to change and will + * vary based on the implementation. For Unicode 15, the table is at + * https://www.unicode.org/Public/idna/15.0.0/IdnaMappingTable.txt + * If you use ICU, they parse this table and map it to code using a Python script. + * + * The resulting strings should not exceed 255 octets according to RFC 1035 section 2.3.4. + * ICU checks for label size and domain size, but if we pass "be_strict = false", these + * errors are ignored. + * + * @see https://url.spec.whatwg.org/#concept-domain-to-ascii + * + */ + bool to_ascii(std::optional& out, std::string_view plain, bool be_strict, size_t first_percent); + + /** + * Checks if the input has tab or newline characters. 
+ * + * @attention The has_tabs_or_newline function is a bottleneck and it is simple enough that compilers + * like GCC can 'autovectorize' it. + */ + ada_really_inline constexpr bool has_tabs_or_newline(std::string_view user_input) noexcept; + + /** + * Checks if the input is a forbidden host code point. + * @see https://url.spec.whatwg.org/#forbidden-host-code-point + */ + ada_really_inline constexpr bool is_forbidden_host_code_point(const char c) noexcept; + + /** + * Checks if the input is a forbidden domain code point. + * @see https://url.spec.whatwg.org/#forbidden-domain-code-point + */ + ada_really_inline constexpr bool is_forbidden_domain_code_point(const char c) noexcept; + + /** + * Checks if the input is alphanumeric, '+', '-' or '.' + */ + ada_really_inline constexpr bool is_alnum_plus(const char c) noexcept; + + /** + * @details An ASCII hex digit is an ASCII upper hex digit or ASCII lower hex digit. + * An ASCII upper hex digit is an ASCII digit or a code point in the range U+0041 (A) to U+0046 (F), inclusive. + * An ASCII lower hex digit is an ASCII digit or a code point in the range U+0061 (a) to U+0066 (f), inclusive. + */ + ada_really_inline constexpr bool is_ascii_hex_digit(const char c) noexcept; + + /** + * Checks if the input is a C0 control or space character. + * + * @details A C0 control or space is a C0 control or U+0020 SPACE. + * A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive. + */ + ada_really_inline constexpr bool is_c0_control_or_space(const char c) noexcept; + + /** + * Checks if the input is an ASCII tab or newline character. + * + * @details An ASCII tab or newline is U+0009 TAB, U+000A LF, or U+000D CR. + */ + ada_really_inline constexpr bool is_ascii_tab_or_newline(const char c) noexcept; + + /** + * @details A double-dot path segment must be ".." or an ASCII case-insensitive match for ".%2e", "%2e.", or "%2e%2e". 
+ */ + ada_really_inline ada_constexpr bool is_double_dot_path_segment(const std::string_view input) noexcept; + + /** + * @details A single-dot path segment must be "." or an ASCII case-insensitive match for "%2e". + */ + ada_really_inline constexpr bool is_single_dot_path_segment(const std::string_view input) noexcept; + + /** + * @details ipv4 character might contain 0-9 or a-f character ranges. + */ + ada_really_inline constexpr bool is_lowercase_hex(const char c) noexcept; + + unsigned constexpr convert_hex_to_binary(char c) noexcept; + + /** + * first_percent should be = input.find('%') + * + * @todo It would be faster as noexcept maybe, but it could be unsafe since. + * @author Node.js + * @see https://github.com/nodejs/node/blob/main/src/node_url.cc#L245 + * @see https://encoding.spec.whatwg.org/#utf-8-decode-without-bom + */ + std::string percent_decode(const std::string_view input, size_t first_percent); + + /** + * Returns a percent-encoding string whether percent encoding was needed or not. + * @see https://github.com/nodejs/node/blob/main/src/node_url.cc#L226 + */ + std::string percent_encode(const std::string_view input, const uint8_t character_set[]); + + /** + * Returns true if percent encoding was needed, in which case, we store + * the percent-encoded content in 'out'. Otherwise, out is left unchanged. + * @see https://github.com/nodejs/node/blob/main/src/node_url.cc#L226 + */ + bool percent_encode(const std::string_view input, const uint8_t character_set[], std::string& out); + +} // namespace ada::unicode + +#endif // ADA_UNICODE_H +/* end file include/ada/unicode.h */ + +#include +#include +#include +#include +#include +#include + +namespace ada { + /** + * @brief A URL is a struct that represents a universal identifier. + * @details To disambiguate from a valid URL string it can also be referred to as a URL record. 
+ * + * @see https://url.spec.whatwg.org/#url-representation + */ + struct url { + /** + * @private + * A URL’s username is an ASCII string identifying a username. It is initially the empty string. + */ + std::string username{}; + + /** + * @private + * A URL’s password is an ASCII string identifying a password. It is initially the empty string. + */ + std::string password{}; + + /** + * @private + * A URL’s host is null or a host. It is initially null. + */ + std::optional host{}; + + /** + * @private + * A URL’s port is either null or a 16-bit unsigned integer that identifies a networking port. It is initially null. + */ + std::optional port{}; + + /** + * @private + * A URL’s path is either an ASCII string or a list of zero or more ASCII strings, usually identifying a location. + */ + std::string path{}; + + /** + * @private + * A URL’s query is either null or an ASCII string. It is initially null. + */ + std::optional query{}; + + /** + * @private + * A URL’s fragment is either null or an ASCII string that can be used for further processing on the resource + * the URL’s other components identify. It is initially null. + */ + std::optional fragment{}; + + /** + * @see https://url.spec.whatwg.org/#dom-url-href + * @see https://url.spec.whatwg.org/#concept-url-serializer + */ + [[nodiscard]] std::string get_href() const noexcept; + + /** + * The origin getter steps are to return the serialization of this’s URL’s origin. [HTML] + * @see https://url.spec.whatwg.org/#concept-url-origin + */ + [[nodiscard]] std::string get_origin() const noexcept; + + /** + * The protocol getter steps are to return this’s URL’s scheme, followed by U+003A (:). + * @see https://url.spec.whatwg.org/#dom-url-protocol + */ + [[nodiscard]] std::string get_protocol() const noexcept; + + /** + * Return url’s host, serialized, followed by U+003A (:) and url’s port, serialized. 
+ * @see https://url.spec.whatwg.org/#dom-url-host + */ + [[nodiscard]] std::string get_host() const noexcept; + + /** + * Return this’s URL’s host, serialized. + * @see https://url.spec.whatwg.org/#dom-url-hostname + */ + [[nodiscard]] std::string get_hostname() const noexcept; + + /** + * The pathname getter steps are to return the result of URL path serializing this’s URL. + * @see https://url.spec.whatwg.org/#dom-url-pathname + */ + [[nodiscard]] std::string get_pathname() const noexcept; + + /** + * Return U+003F (?), followed by this’s URL’s query. + * @see https://url.spec.whatwg.org/#dom-url-search + */ + [[nodiscard]] std::string get_search() const noexcept; + + /** + * The username getter steps are to return this’s URL’s username. + * @see https://url.spec.whatwg.org/#dom-url-username + */ + [[nodiscard]] std::string get_username() const noexcept; + + /** + * @return Returns true on successful operation. + * @see https://url.spec.whatwg.org/#dom-url-username + */ + bool set_username(const std::string_view input); + + /** + * @return Returns true on success. + * @see https://url.spec.whatwg.org/#dom-url-password + */ + bool set_password(const std::string_view input); + + /** + * @return Returns true on success. + * @see https://url.spec.whatwg.org/#dom-url-port + */ + bool set_port(const std::string_view input); + + /** + * This function always succeeds. + * @see https://url.spec.whatwg.org/#dom-url-hash + */ + void set_hash(const std::string_view input); + + /** + * This function always succeeds. + * @see https://url.spec.whatwg.org/#dom-url-search + */ + void set_search(const std::string_view input); + + /** + * @return Returns true on success. + * @see https://url.spec.whatwg.org/#dom-url-pathname + */ + bool set_pathname(const std::string_view input); + + /** + * @return Returns true on success. + * @see https://url.spec.whatwg.org/#dom-url-host + */ + bool set_host(const std::string_view input); + + /** + * @return Returns true on success. 
+ * @see https://url.spec.whatwg.org/#dom-url-hostname + */ + bool set_hostname(const std::string_view input); + + /** + * @return Returns true on success. + * @see https://url.spec.whatwg.org/#dom-url-protocol + */ + bool set_protocol(const std::string_view input); + + /** + * @see https://url.spec.whatwg.org/#dom-url-href + */ + bool set_href(const std::string_view input); + + /** + * The password getter steps are to return this’s URL’s password. + * @see https://url.spec.whatwg.org/#dom-url-password + */ + [[nodiscard]] std::string get_password() const noexcept; + + /** + * Return this’s URL’s port, serialized. + * @see https://url.spec.whatwg.org/#dom-url-port + */ + [[nodiscard]] std::string get_port() const noexcept; + + /** + * Return U+0023 (#), followed by this’s URL’s fragment. + * @see https://url.spec.whatwg.org/#dom-url-hash + */ + [[nodiscard]] std::string get_hash() const noexcept; + + /** + * Returns true if this URL has a valid domain as per RFC 1034 and + * corresponding specifications. Among other things, it requires + * that the domain string has fewer than 255 octets. + */ + [[nodiscard]] bool has_valid_domain() const noexcept; + + /** + * Used for returning the validity from the result of the URL parser. + */ + bool is_valid{true}; + + /** + * A URL has an opaque path if its path is a string. + */ + bool has_opaque_path{false}; + + /** + * A URL includes credentials if its username or password is not the empty string. + */ + [[nodiscard]] ada_really_inline bool includes_credentials() const noexcept { + return !username.empty() || !password.empty(); + } + + /** + * A URL is special if its scheme is a special scheme. A URL is not special if its scheme is not a special scheme. + */ + [[nodiscard]] ada_really_inline bool is_special() const noexcept { + return type != ada::scheme::NOT_SPECIAL; + } + + /** + * @private + * + * Return the 'special port' if the URL is special and not 'file'. + * Returns 0 otherwise. 
+ */ + [[nodiscard]] uint16_t get_special_port() const { + return ada::scheme::get_special_port(type); + } + + /** + * @private + * + * Return the scheme type. Note that it is faster to do + * get_scheme_type() == ada::scheme::type::FILE than to do + * get_scheme() == "file", since the former is a direct integer comparison, + * while the other involves a (cheap) string test. + */ + [[nodiscard]] ada_really_inline ada::scheme::type get_scheme_type() const noexcept { + return type; + } + + /** + * @private + * + * Get the default port if the url's scheme has one, returns 0 otherwise. + */ + [[nodiscard]] ada_really_inline uint16_t scheme_default_port() const noexcept { + return scheme::get_special_port(type); + } + + /** + * @private + * + * A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file". + */ + [[nodiscard]] bool cannot_have_credentials_or_port() const { + return !host.has_value() || host.value().empty() || type == ada::scheme::type::FILE; + } + /** For development purposes, we want to know when a copy is made. */ + url() = default; + url(const url &u) = default; + url(url &&u) noexcept = default; + url &operator=(url &&u) noexcept = default; + url &operator=(const url &u) = default; + ADA_ATTRIBUTE_NOINLINE ~url() = default; + + /** + * @private + * + * Parse a port (16-bit decimal digit) from the provided input. + * We assume that the input does not contain spaces or tabs + * within the ASCII digits. + * It returns how many bytes were consumed when a number is successfully parsed. + * @return On failure, it returns zero. 
+ * @see https://url.spec.whatwg.org/#host-parsing + */ + ada_really_inline size_t parse_port(std::string_view view, bool check_trailing_content = false) noexcept { + ada_log("parse_port('", view, "') ", view.size()); + uint16_t parsed_port{}; + auto r = std::from_chars(view.data(), view.data() + view.size(), parsed_port); + // The digits do not fit in a uint16_t: the port is invalid. + if(r.ec == std::errc::result_out_of_range) { + ada_log("parse_port: std::errc::result_out_of_range"); + is_valid = false; + return 0; + } + ada_log("parse_port: ", parsed_port); + const size_t consumed = size_t(r.ptr - view.data()); + ada_log("parse_port: consumed ", consumed); + if(check_trailing_content) { + is_valid &= (consumed == view.size() || view[consumed] == '/' || view[consumed] == '?' || (is_special() && view[consumed] == '\\')); + } + ada_log("parse_port: is_valid = ", is_valid); + if(is_valid) { + // The scheme's default port (and the no-digits case) is stored as "no port". + port = (r.ec == std::errc() && scheme_default_port() != parsed_port) ? + std::optional(parsed_port) : std::nullopt; + } + return consumed; + } + + /** + * @private + * + * Return a string representing the scheme. Note that get_scheme_type() should often be used instead. + * @see https://url.spec.whatwg.org/#dom-url-protocol + */ + [[nodiscard]] std::string_view get_scheme() const noexcept { + if(is_special()) { return ada::scheme::details::is_special_list[type]; } + // Only non-special schemes are stored in non_special_scheme. + return non_special_scheme; + } + + /** + * Set the scheme for this URL. The provided scheme should be a valid + * scheme string, be lower-cased, not contain spaces or tabs. It should + * have no spurious trailing or leading content. + * + * @param new_scheme the scheme string; when it is non-special its content is + * moved into this URL and the argument is left valid but unspecified. + */ + void set_scheme(std::string&& new_scheme) noexcept { + type = ada::scheme::get_scheme_type(new_scheme); + // Special schemes are fully described by 'type', so the string is only + // kept when the scheme is non-special. It is moved, not copied: a copy + // could allocate and throw inside this noexcept function. The cast is + // what std::move does, spelled out so that no extra include is needed. + if(!is_special()) { + non_special_scheme = static_cast<std::string&&>(new_scheme); + } + } + + /** + * @private + * + * Take the scheme from another URL. The scheme string is moved from the + * provided url. 
+ */ + void copy_scheme(ada::url&& u) noexcept { + // Steal the string instead of copying it: a copy could allocate and throw + // inside this noexcept function. The cast is what std::move does, spelled + // out so that no extra include is needed. 'u' is left with a valid but + // unspecified scheme string. + non_special_scheme = static_cast<std::string&&>(u.non_special_scheme); + type = u.type; + } + + /** + * @private + * + * Take the scheme from another URL. The scheme string is copied from the + * provided url. + */ + void copy_scheme(const ada::url& u) { + non_special_scheme = u.non_special_scheme; + type = u.type; + } + + /** + * @private + * + * Parse the host from the provided input. We assume that + * the input does not contain spaces or tabs. Control + * characters and spaces are not trimmed (they should have + * been removed if needed). + * Return true on success. + * @see https://url.spec.whatwg.org/#host-parsing + */ + [[nodiscard]] ada_really_inline bool parse_host(std::string_view input); + + /** + * @private + * + * Parse the path from the provided input. + * Return true on success. Control characters are not + * trimmed from the ends (they should have + * been removed if needed). + * + * The input is expected to be UTF-8. + * + * @see https://url.spec.whatwg.org/ + */ + [[nodiscard]] ada_really_inline bool parse_path(const std::string_view input); + + /** + * @private + * + * Parse the path from the provided input. It should have been + * 'prepared' (e.g., it cannot contain tabs and spaces). See + * parse_path. + * + * The input is expected to be UTF-8. + * + * Return true on success. + * @see https://url.spec.whatwg.org/ + */ + [[nodiscard]] ada_really_inline bool parse_prepared_path(const std::string_view input); + + /** + * @private + */ + template + [[nodiscard]] ada_really_inline bool parse_scheme(const std::string_view input); + + /** + * Returns a JSON string representation of this URL. + */ + std::string to_string() const; + + private: + + /** + * @private + * + * Return true on success. + * @see https://url.spec.whatwg.org/#concept-ipv4-parser + */ + [[nodiscard]] bool parse_ipv4(std::string_view input); + + /** + * @private + * + * Return true on success. 
+ * @see https://url.spec.whatwg.org/#concept-ipv6-parser + */ + [[nodiscard]] bool parse_ipv6(std::string_view input); + + /** + * @private + * + * Return true on success. + * @see https://url.spec.whatwg.org/#concept-opaque-host-parser + */ + [[nodiscard]] bool parse_opaque_host(std::string_view input); + + /** + * @private + */ + ada::scheme::type type{ada::scheme::type::NOT_SPECIAL}; + + /** + * @private + * + * A URL’s scheme is an ASCII string that identifies the type of URL and can be used to dispatch a + * URL for further processing after parsing. It is initially the empty string. + * We only set non_special_scheme when the scheme is non-special, otherwise we avoid constructing + * string. + * + * Special schemes are stored in ada::scheme::details::is_special_list so we typically do not need + * to store them in each url instance. + */ + std::string non_special_scheme{}; + + }; // struct url + + + inline std::ostream& operator<<(std::ostream& out, const ada::url& u) { + return out << u.to_string(); + } +} // namespace ada + +#endif // ADA_URL_H +/* end file include/ada/url.h */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/state.h +/* begin file include/ada/state.h */ +/** + * @file state.h + * @brief Definitions for the states of the URL state machine. + */ +#ifndef ADA_STATE_H +#define ADA_STATE_H + + +#include + +namespace ada { + + /** + * @see https://url.spec.whatwg.org/#url-parsing + */ + enum class state { + AUTHORITY, + SCHEME_START, + SCHEME, + HOST, + NO_SCHEME, + FRAGMENT, + RELATIVE_SCHEME, + RELATIVE_SLASH, + FILE, + FILE_HOST, + FILE_SLASH, + PATH_OR_AUTHORITY, + SPECIAL_AUTHORITY_IGNORE_SLASHES, + SPECIAL_AUTHORITY_SLASHES, + SPECIAL_RELATIVE_OR_AUTHORITY, + QUERY, + PATH, + PATH_START, + OPAQUE_PATH, + PORT, + }; + + /** + * Stringify a URL state machine state. 
+ */ + ada_warn_unused std::string to_string(ada::state s); + +} // ada namespace + +#endif // ADA_STATE_H +/* end file include/ada/state.h */ + +#include +#include + +namespace ada::helpers { + + /** + * This function is used to prune a fragment from a url, and returning the removed string if input has fragment. + * + * @details prune_fragment seeks the first '#' and returns everything after it as a + * string_view, and modifies (in place) the input so that it points at everything + * before the '#'. If no '#' is found, the input is left unchanged and std::nullopt is returned. + * + * @attention The function is non-allocating and it does not throw. + * @returns Note that the returned string_view might be empty! + */ + ada_really_inline std::optional prune_fragment(std::string_view& input) noexcept; + + /** + * Defined by the URL specification, shorten a URLs paths. + * @see https://url.spec.whatwg.org/#shorten-a-urls-path + */ + ada_really_inline void shorten_path(ada::url &url) noexcept; + + /** + * Remove and mutate all ASCII tab or newline characters from an input. + */ + ada_really_inline void remove_ascii_tab_or_newline(std::string& input) noexcept; + + /** + * Return the substring from input going from index pos to the end. If pos > input.size(), + * it returns an empty string_view. This function cannot throw. + */ + ada_really_inline std::string_view substring(std::string_view input, size_t pos) noexcept; + + /** + * Returns a host's delimiter location depending on the state of the instance. + * Used by the host parser. + */ + ada_really_inline size_t get_host_delimiter_location(const ada::url& url, std::string_view& view, bool& inside_brackets) noexcept; + + /** + * Removes leading and trailing C0 control and whitespace characters from string. 
+ */ + ada_really_inline void trim_c0_whitespace(std::string_view& input) noexcept; + +} // namespace ada::helpers + +#endif // ADA_HELPERS_H +/* end file include/ada/helpers.h */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/parser.h +/* begin file include/ada/parser.h */ +/** + * @file parser.h + * @brief Definitions for the parser. + */ +#ifndef ADA_PARSER_H +#define ADA_PARSER_H + + +#include +#include + +namespace ada::parser { + + /** + * Parses a url. + */ + url parse_url(std::string_view user_input, + const ada::url* base_url = nullptr, + ada::encoding_type encoding = ada::encoding_type::UTF8); + +} // namespace ada::parser + +#endif // ADA_PARSER_H +/* end file include/ada/parser.h */ + +// Public API +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/ada_version.h +/* begin file include/ada/ada_version.h */ +/** + * @file ada_version.h + * @brief Definitions for Ada's version number. + */ +#ifndef ADA_ADA_VERSION_H +#define ADA_ADA_VERSION_H + +#define ADA_VERSION "0.1.0" + +namespace ada { + + enum { + ADA_VERSION_MAJOR = 0, + ADA_VERSION_MINOR = 1, + ADA_VERSION_REVISION = 0, + }; + +} // namespace ada + +#endif // ADA_ADA_VERSION_H +/* end file include/ada/ada_version.h */ +// dofile: invoked with prepath=/Users/yagiz/Developer/url-parser/include, filename=ada/implementation.h +/* begin file include/ada/implementation.h */ +/** + * @file implementation.h + * + * @brief Definitions for user facing functions for parsing URL and its components. + */ +#ifndef ADA_IMPLEMENTATION_H +#define ADA_IMPLEMENTATION_H + +#include +#include + + +namespace ada { + + /** + * The URL parser takes a scalar value string input, with an optional null or base URL base (default null) + * and an optional encoding encoding (default UTF-8). + * + * @param input the string input to analyze. + * @param base_url the optional base url (a pointer to an ada::url, or nullptr). 
+ * @param encoding encoding (default to UTF-8) + * + * @example + * + * ```cpp + * auto url = ada::parse("https://www.google.com"); + * ``` + */ + ada_warn_unused ada::url parse(std::string_view input, + const ada::url* base_url = nullptr, + ada::encoding_type encoding = ada::encoding_type::UTF8); + +} + +#endif // ADA_IMPLEMENTATION_H +/* end file include/ada/implementation.h */ + +#endif // ADA_H +/* end file include/ada.h */ diff --git a/lib/internal/url.js b/lib/internal/url.js index 8aca33b68f61ec..f3cd32d538fa93 100644 --- a/lib/internal/url.js +++ b/lib/internal/url.js @@ -16,11 +16,9 @@ const { ObjectGetOwnPropertySymbols, ObjectGetPrototypeOf, ObjectKeys, - ReflectApply, ReflectGetOwnPropertyDescriptor, ReflectOwnKeys, RegExpPrototypeSymbolReplace, - String, StringPrototypeCharAt, StringPrototypeCharCodeAt, StringPrototypeCodePointAt, @@ -90,25 +88,8 @@ const isWindows = platform === 'win32'; const { domainToASCII: _domainToASCII, domainToUnicode: _domainToUnicode, - encodeAuth, parse, - setURLConstructor, - URL_FLAGS_CANNOT_BE_BASE, - URL_FLAGS_HAS_FRAGMENT, - URL_FLAGS_HAS_HOST, - URL_FLAGS_HAS_PASSWORD, - URL_FLAGS_HAS_PATH, - URL_FLAGS_HAS_QUERY, - URL_FLAGS_HAS_USERNAME, - URL_FLAGS_IS_DEFAULT_SCHEME_PORT, - URL_FLAGS_SPECIAL, - kFragment, - kHost, - kHostname, - kPathStart, - kPort, - kQuery, - kSchemeStart + updateUrl, } = internalBinding('url'); const { @@ -119,13 +100,21 @@ const { const FORWARD_SLASH = /\//g; const context = Symbol('context'); -const cannotBeBase = Symbol('cannot-be-base'); -const cannotHaveUsernamePasswordPort = - Symbol('cannot-have-username-password-port'); -const special = Symbol('special'); const searchParams = Symbol('query'); const kFormat = Symbol('format'); +const updateActions = { + kProtocol: 0, + kHost: 1, + kHostname: 2, + kPort: 3, + kUsername: 4, + kPassword: 5, + kPathname: 6, + kSearch: 7, + kHash: 8, + kHref: 9, +}; let blob; let cryptoRandom; @@ -144,14 +133,6 @@ function lazyCryptoRandom() { return 
cryptoRandom; } -// Refs: https://html.spec.whatwg.org/multipage/browsers.html#concept-origin-opaque -const kOpaqueOrigin = 'null'; - -// Refs: https://html.spec.whatwg.org/multipage/browsers.html#ascii-serialisation-of-an-origin -function serializeTupleOrigin(scheme, host, port) { - return `${scheme}//${host}${port === null ? '' : `:${port}`}`; -} - // This class provides the internal state of a URL object. An instance of this // class is stored in every URL object and is accessed internally by setters // and getters. It roughly corresponds to the concept of a URL record in the @@ -160,15 +141,17 @@ function serializeTupleOrigin(scheme, host, port) { // Refs: https://url.spec.whatwg.org/#concept-url class URLContext { constructor() { - this.flags = 0; - this.scheme = ':'; + this.href = ''; + this.origin = ''; + this.protocol = ''; + this.host = ''; + this.hostname = ''; + this.pathname = ''; + this.search = ''; this.username = ''; this.password = ''; - this.host = null; - this.port = null; - this.path = []; - this.query = null; - this.fragment = null; + this.port = ''; + this.hash = ''; } } @@ -246,7 +229,6 @@ class URLSearchParams { } else { // USVString init = toUSVString(init); - if (init[0] === '?') init = init.slice(1); initSearchParams(this, init); } @@ -542,125 +524,72 @@ ObjectDefineProperties(URLSearchParams.prototype, { }, }); -function onParseComplete(flags, protocol, username, password, - host, port, path, query, fragment) { +function onParseComplete(href, origin, protocol, host, hostname, pathname, search, username, password, port, hash) { const ctx = this[context]; - ctx.flags = flags; - ctx.scheme = protocol; - ctx.username = (flags & URL_FLAGS_HAS_USERNAME) !== 0 ? username : ''; - ctx.password = (flags & URL_FLAGS_HAS_PASSWORD) !== 0 ? password : ''; - ctx.port = port; - ctx.path = (flags & URL_FLAGS_HAS_PATH) !== 0 ? 
path : []; - ctx.query = query; - ctx.fragment = fragment; + ctx.href = href; + ctx.origin = origin; + ctx.protocol = protocol; ctx.host = host; + ctx.hostname = hostname; + ctx.pathname = pathname; + ctx.search = search; + ctx.username = username; + ctx.password = password; + ctx.port = port; + ctx.hash = hash; if (!this[searchParams]) { // Invoked from URL constructor this[searchParams] = new URLSearchParams(); this[searchParams][context] = this; } - initSearchParams(this[searchParams], query); -} - -function onParseError(input, flags) { - throw new ERR_INVALID_URL(input); -} - -function onParseProtocolComplete(flags, protocol, username, password, - host, port, path, query, fragment) { - const ctx = this[context]; - if ((flags & URL_FLAGS_SPECIAL) !== 0) { - ctx.flags |= URL_FLAGS_SPECIAL; - } else { - ctx.flags &= ~URL_FLAGS_SPECIAL; - } - ctx.scheme = protocol; - ctx.port = port; -} - -function onParseHostnameComplete(flags, protocol, username, password, - host, port, path, query, fragment) { - const ctx = this[context]; - if ((flags & URL_FLAGS_HAS_HOST) !== 0) { - ctx.host = host; - ctx.flags |= URL_FLAGS_HAS_HOST; - } else { - ctx.host = null; - ctx.flags &= ~URL_FLAGS_HAS_HOST; - } -} - -function onParsePortComplete(flags, protocol, username, password, - host, port, path, query, fragment) { - this[context].port = port; -} - -function onParseHostComplete(flags, protocol, username, password, - host, port, path, query, fragment) { - ReflectApply(onParseHostnameComplete, this, arguments); - if (port !== null || ((flags & URL_FLAGS_IS_DEFAULT_SCHEME_PORT) !== 0)) - ReflectApply(onParsePortComplete, this, arguments); -} - -function onParsePathComplete(flags, protocol, username, password, - host, port, path, query, fragment) { - const ctx = this[context]; - if ((flags & URL_FLAGS_HAS_PATH) !== 0) { - ctx.path = path; - ctx.flags |= URL_FLAGS_HAS_PATH; - } else { - ctx.path = []; - ctx.flags &= ~URL_FLAGS_HAS_PATH; - } - - // The C++ binding may set host to empty 
string. - if ((flags & URL_FLAGS_HAS_HOST) !== 0) { - ctx.host = host; - ctx.flags |= URL_FLAGS_HAS_HOST; - } -} - -function onParseSearchComplete(flags, protocol, username, password, - host, port, path, query, fragment) { - this[context].query = query; -} - -function onParseHashComplete(flags, protocol, username, password, - host, port, path, query, fragment) { - this[context].fragment = fragment; + initSearchParams(this[searchParams], ctx.search); } function isURLThis(self) { return (self !== undefined && self !== null && self[context] !== undefined); } +function constructHref(ctx) { + let ret = ctx.protocol; + if (ctx.host !== '') { + ret += '//'; + const has_username = ctx.username !== ''; + const has_password = ctx.password !== ''; + if (has_username || has_password) { + if (has_username) + ret += ctx.username; + if (has_password) + ret += `:${ctx.password}`; + ret += '@'; + } + ret += ctx.host; + } + ret += ctx.pathname; + if (ctx.search.length > 0) + ret += ctx.search; + if (ctx.hash.length > 0) + ret += ctx.hash; + return ret; +} + class URL { constructor(input, base = undefined) { // toUSVString is not needed. 
input = `${input}`; - let base_context; - if (base !== undefined) { - base_context = new URL(base)[context]; - } this[context] = new URLContext(); - parse(input, -1, base_context, undefined, - FunctionPrototypeBind(onParseComplete, this), - FunctionPrototypeBind(onParseError, this, input)); - } - get [special]() { - return (this[context].flags & URL_FLAGS_SPECIAL) !== 0; - } + if (base != null) { + if (typeof base === 'object') { + base = base.href; + } + } - get [cannotBeBase]() { - return (this[context].flags & URL_FLAGS_CANNOT_BE_BASE) !== 0; - } + const isValid = parse(input, + base, + FunctionPrototypeBind(onParseComplete, this)); - // https://url.spec.whatwg.org/#cannot-have-a-username-password-port - get [cannotHaveUsernamePasswordPort]() { - const { host, scheme } = this[context]; - return ((host == null || host === '') || - this[cannotBeBase] || - scheme === 'file:'); + if (!isValid) { + throw new ERR_INVALID_URL(input); + } } [inspect.custom](depth, opts) { @@ -689,8 +618,6 @@ class URL { obj.hash = this.hash; if (opts.showHidden) { - obj.cannotBeBase = this[cannotBeBase]; - obj.special = this[special]; obj[context] = this[context]; } @@ -710,8 +637,8 @@ class URL { }; const ctx = this[context]; // https://url.spec.whatwg.org/#url-serializing - let ret = ctx.scheme; - if (ctx.host !== null) { + let ret = ctx.protocol; + if (ctx.host !== '') { ret += '//'; const has_username = ctx.username !== ''; const has_password = ctx.password !== ''; @@ -723,24 +650,15 @@ class URL { ret += '@'; } ret += options.unicode ? 
- domainToUnicode(ctx.host) : ctx.host; - if (ctx.port !== null) + domainToUnicode(ctx.hostname) : ctx.hostname; + if (ctx.port !== '') ret += `:${ctx.port}`; } - if (this[cannotBeBase]) { - ret += ctx.path[0]; - } else { - if (ctx.host === null && ctx.path.length > 1 && ctx.path[0] === '') { - ret += '/.'; - } - if (ctx.path.length) { - ret += '/' + ArrayPrototypeJoin(ctx.path, '/'); - } - } - if (options.search && ctx.query !== null) - ret += `?${ctx.query}`; - if (options.fragment && ctx.fragment !== null) - ret += `#${ctx.fragment}`; + ret += ctx.pathname; + if (options.search && ctx.search.length > 0) + ret += ctx.search; + if (options.fragment && ctx.hash.length > 0) + ret += ctx.hash; return ret; } @@ -748,67 +666,39 @@ class URL { toString() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - return this[kFormat](); + return this[context].href; } get href() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - return this[kFormat](); + return this[context].href; } - set href(input) { + set href(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - // toUSVString is not needed. - input = `${input}`; - parse(input, -1, undefined, undefined, - FunctionPrototypeBind(onParseComplete, this), - FunctionPrototypeBind(onParseError, this, input)); + const valid = updateUrl(this.href, updateActions.kHref, `${value}`, FunctionPrototypeBind(onParseComplete, this)); + if (!valid) { throw ERR_INVALID_URL(`${value}`); } } // readonly get origin() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - // Refs: https://url.spec.whatwg.org/#concept-url-origin - const ctx = this[context]; - switch (ctx.scheme) { - case 'blob:': - if (ctx.path.length > 0) { - try { - return (new URL(ctx.path[0])).origin; - } catch { - // Fall through... 
do nothing - } - } - return kOpaqueOrigin; - case 'ftp:': - case 'http:': - case 'https:': - case 'ws:': - case 'wss:': - return serializeTupleOrigin(ctx.scheme, ctx.host, ctx.port); - } - return kOpaqueOrigin; + return this[context].origin; } get protocol() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - return this[context].scheme; + return this[context].protocol; } - set protocol(scheme) { + set protocol(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - // toUSVString is not needed. - scheme = `${scheme}`; - if (scheme.length === 0) - return; - const ctx = this[context]; - parse(scheme, kSchemeStart, null, ctx, - FunctionPrototypeBind(onParseProtocolComplete, this)); + updateUrl(this.href, updateActions.kProtocol, `${value}`, FunctionPrototypeBind(onParseComplete, this)); } get username() { @@ -817,21 +707,10 @@ class URL { return this[context].username; } - set username(username) { + set username(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - // toUSVString is not needed. - username = `${username}`; - if (this[cannotHaveUsernamePasswordPort]) - return; - const ctx = this[context]; - if (username === '') { - ctx.username = ''; - ctx.flags &= ~URL_FLAGS_HAS_USERNAME; - return; - } - ctx.username = encodeAuth(username); - ctx.flags |= URL_FLAGS_HAS_USERNAME; + updateUrl(this.href, updateActions.kUsername, `${value}`, FunctionPrototypeBind(onParseComplete, this)); } get password() { @@ -840,138 +719,72 @@ class URL { return this[context].password; } - set password(password) { + set password(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - // toUSVString is not needed. 
- password = `${password}`; - if (this[cannotHaveUsernamePasswordPort]) - return; - const ctx = this[context]; - if (password === '') { - ctx.password = ''; - ctx.flags &= ~URL_FLAGS_HAS_PASSWORD; - return; - } - ctx.password = encodeAuth(password); - ctx.flags |= URL_FLAGS_HAS_PASSWORD; + updateUrl(this.href, updateActions.kPassword, `${value}`, FunctionPrototypeBind(onParseComplete, this)); } get host() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - const ctx = this[context]; - let ret = ctx.host || ''; - if (ctx.port !== null) - ret += `:${ctx.port}`; - return ret; + return this[context].host; } - set host(host) { + set host(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - const ctx = this[context]; - // toUSVString is not needed. - host = `${host}`; - if (this[cannotBeBase]) { - // Cannot set the host if cannot-be-base is set - return; - } - parse(host, kHost, null, ctx, - FunctionPrototypeBind(onParseHostComplete, this)); + updateUrl(this.href, updateActions.kHost, `${value}`, FunctionPrototypeBind(onParseComplete, this)); } get hostname() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - return this[context].host || ''; + return this[context].hostname; } - set hostname(host) { + set hostname(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - const ctx = this[context]; - // toUSVString is not needed. - host = `${host}`; - if (this[cannotBeBase]) { - // Cannot set the host if cannot-be-base is set - return; - } - parse(host, kHostname, null, ctx, onParseHostnameComplete.bind(this)); + updateUrl(this.href, updateActions.kHostname, `${value}`, FunctionPrototypeBind(onParseComplete, this)); } get port() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - const port = this[context].port; - return port === null ? '' : String(port); + return this[context].port; } - set port(port) { + set port(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - // toUSVString is not needed. 
- port = `${port}`; - if (this[cannotHaveUsernamePasswordPort]) - return; - const ctx = this[context]; - if (port === '') { - ctx.port = null; - return; - } - parse(port, kPort, null, ctx, - FunctionPrototypeBind(onParsePortComplete, this)); + updateUrl(this.href, updateActions.kPort, `${value}`, FunctionPrototypeBind(onParseComplete, this)); } get pathname() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - const ctx = this[context]; - if (this[cannotBeBase]) - return ctx.path[0]; - if (ctx.path.length === 0) - return ''; - return `/${ArrayPrototypeJoin(ctx.path, '/')}`; + return this[context].pathname; } - set pathname(path) { + set pathname(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - // toUSVString is not needed. - path = `${path}`; - if (this[cannotBeBase]) - return; - parse(path, kPathStart, null, this[context], - onParsePathComplete.bind(this)); + updateUrl(this.href, updateActions.kPathname, `${value}`, FunctionPrototypeBind(onParseComplete, this)); } get search() { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - const { query } = this[context]; - if (query === null || query === '') - return ''; - return `?${query}`; + return this[context].search; } set search(search) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - const ctx = this[context]; search = toUSVString(search); - if (search === '') { - ctx.query = null; - ctx.flags &= ~URL_FLAGS_HAS_QUERY; - } else { - if (search[0] === '?') search = StringPrototypeSlice(search, 1); - ctx.query = ''; - ctx.flags |= URL_FLAGS_HAS_QUERY; - if (search) { - parse(search, kQuery, null, ctx, - FunctionPrototypeBind(onParseSearchComplete, this)); - } - } - initSearchParams(this[searchParams], search); + updateUrl(this.href, updateActions.kSearch, search, FunctionPrototypeBind(onParseComplete, this)); + initSearchParams(this[searchParams], this[context].search); } // readonly @@ -984,28 +797,13 @@ class URL { get hash() { if (!isURLThis(this)) throw new 
ERR_INVALID_THIS('URL'); - const { fragment } = this[context]; - if (fragment === null || fragment === '') - return ''; - return `#${fragment}`; + return this[context].hash; } - set hash(hash) { + set hash(value) { if (!isURLThis(this)) throw new ERR_INVALID_THIS('URL'); - const ctx = this[context]; - // toUSVString is not needed. - hash = `${hash}`; - if (!hash) { - ctx.fragment = null; - ctx.flags &= ~URL_FLAGS_HAS_FRAGMENT; - return; - } - if (hash[0] === '#') hash = StringPrototypeSlice(hash, 1); - ctx.fragment = ''; - ctx.flags |= URL_FLAGS_HAS_FRAGMENT; - parse(hash, kFragment, null, ctx, - FunctionPrototypeBind(onParseHashComplete, this)); + updateUrl(this.href, updateActions.kHash, `${value}`, FunctionPrototypeBind(onParseComplete, this)); } toJSON() { @@ -1036,6 +834,7 @@ class URL { static revokeObjectURL(url) { url = `${url}`; try { + // TODO(@anonrig): Remove this try/catch by calling `parse` directly. const parsed = new URL(url); const split = StringPrototypeSplit(parsed.pathname, ':'); if (split.length === 2) @@ -1076,13 +875,12 @@ function update(url, params) { const ctx = url[context]; const serializedParams = params.toString(); - if (serializedParams) { - ctx.query = serializedParams; - ctx.flags |= URL_FLAGS_HAS_QUERY; + if (serializedParams.length > 0) { + ctx.search = '?' + serializedParams; } else { - ctx.query = null; - ctx.flags &= ~URL_FLAGS_HAS_QUERY; + ctx.search = ''; } + ctx.href = constructHref(ctx); } function initSearchParams(url, init) { @@ -1097,14 +895,14 @@ function initSearchParams(url, init) { // Ref: https://url.spec.whatwg.org/#concept-urlencoded-parser function parseParams(qs) { const out = []; - let pairStart = 0; - let lastPos = 0; let seenSep = false; let buf = ''; let encoded = false; let encodeCheck = 0; - let i; - for (i = 0; i < qs.length; ++i) { + let i = qs[0] === '?' ? 
1 : 0; + let pairStart = i; + let lastPos = i; + for (; i < qs.length; ++i) { const code = StringPrototypeCharCodeAt(qs, i); // Try matching key/value pair separator @@ -1572,29 +1370,6 @@ function toPathIfFileURL(fileURLOrPath) { return fileURLToPath(fileURLOrPath); } -function constructUrl(flags, protocol, username, password, - host, port, path, query, fragment) { - const ctx = new URLContext(); - ctx.flags = flags; - ctx.scheme = protocol; - ctx.username = (flags & URL_FLAGS_HAS_USERNAME) !== 0 ? username : ''; - ctx.password = (flags & URL_FLAGS_HAS_PASSWORD) !== 0 ? password : ''; - ctx.port = port; - ctx.path = (flags & URL_FLAGS_HAS_PATH) !== 0 ? path : []; - ctx.query = query; - ctx.fragment = fragment; - ctx.host = host; - - const url = { __proto__: URL.prototype }; - url[context] = ctx; - const params = new URLSearchParams(); - url[searchParams] = params; - params[context] = url; - initSearchParams(params, query); - return url; -} -setURLConstructor(constructUrl); - module.exports = { toUSVString, fileURLToPath, diff --git a/node.gyp b/node.gyp index f5da2fa3599ed1..18e98a3a7305aa 100644 --- a/node.gyp +++ b/node.gyp @@ -457,6 +457,7 @@ 'deps/histogram/histogram.gyp:histogram', 'deps/uvwasi/uvwasi.gyp:uvwasi', 'deps/simdutf/simdutf.gyp:simdutf', + 'deps/ada/ada.gyp:ada', ], 'sources': [ @@ -532,7 +533,6 @@ 'src/node_trace_events.cc', 'src/node_types.cc', 'src/node_url.cc', - 'src/node_url_tables.cc', 'src/node_util.cc', 'src/node_v8.cc', 'src/node_wasi.cc', @@ -893,37 +893,6 @@ }, ], }, # node_lib_target_name - { # fuzz_url - 'target_name': 'fuzz_url', - 'type': 'executable', - 'dependencies': [ - '<(node_lib_target_name)', - ], - 'includes': [ - 'node.gypi' - ], - 'include_dirs': [ - 'src', - ], - 'defines': [ - 'NODE_ARCH="<(target_arch)"', - 'NODE_PLATFORM="<(OS)"', - 'NODE_WANT_INTERNALS=1', - ], - 'sources': [ - 'src/node_snapshot_stub.cc', - 'test/fuzzers/fuzz_url.cc', - ], - 'conditions': [ - ['OS=="linux"', { - 'ldflags': [ '-fsanitize=fuzzer' ] - 
}], - # Ensure that ossfuzz flag has been set and that we are on Linux - [ 'OS!="linux" or ossfuzz!="true"', { - 'type': 'none', - }], - ], - }, # fuzz_url { # fuzz_env 'target_name': 'fuzz_env', 'type': 'executable', @@ -975,6 +944,7 @@ 'deps/histogram/histogram.gyp:histogram', 'deps/uvwasi/uvwasi.gyp:uvwasi', 'deps/simdutf/simdutf.gyp:simdutf', + 'deps/ada/ada.gyp:ada', ], 'includes': [ @@ -1015,7 +985,6 @@ 'test/cctest/test_sockaddr.cc', 'test/cctest/test_traced_value.cc', 'test/cctest/test_util.cc', - 'test/cctest/test_url.cc', ], 'conditions': [ @@ -1071,6 +1040,7 @@ '<(node_lib_target_name)', 'deps/histogram/histogram.gyp:histogram', 'deps/uvwasi/uvwasi.gyp:uvwasi', + 'deps/ada/ada.gyp:ada', ], 'includes': [ diff --git a/src/crypto/crypto_common.cc b/src/crypto/crypto_common.cc index a1d0dfc16ce235..d405c503f91d5a 100644 --- a/src/crypto/crypto_common.cc +++ b/src/crypto/crypto_common.cc @@ -1,13 +1,12 @@ +#include "crypto/crypto_common.h" #include "base_object-inl.h" #include "env-inl.h" +#include "memory_tracker-inl.h" +#include "node.h" #include "node_buffer.h" #include "node_crypto.h" -#include "crypto/crypto_common.h" -#include "node.h" #include "node_internals.h" -#include "node_url.h" #include "string_bytes.h" -#include "memory_tracker-inl.h" #include "v8.h" #include diff --git a/src/inspector_agent.cc b/src/inspector_agent.cc index decd9f748a2775..05eed333d01703 100644 --- a/src/inspector_agent.cc +++ b/src/inspector_agent.cc @@ -650,8 +650,8 @@ class NodeInspectorClient : public V8InspectorClient { protocol::StringUtil::StringViewToUtf8(resource_name_view); if (!IsFilePath(resource_name)) return nullptr; - node::url::URL url = node::url::URL::FromFilePath(resource_name); - return Utf8ToStringView(url.href()); + ; + return Utf8ToStringView(node::url::FromFilePath(resource_name).get_href()); } node::Environment* env_; diff --git a/src/module_wrap.cc b/src/module_wrap.cc index d6c766244fd416..9b2b0b8334d102 100644 --- a/src/module_wrap.cc +++ 
b/src/module_wrap.cc @@ -7,7 +7,6 @@ #include "node_external_reference.h" #include "node_internals.h" #include "node_process-inl.h" -#include "node_url.h" #include "node_watchdog.h" #include "util-inl.h" @@ -21,8 +20,6 @@ namespace loader { using errors::TryCatchScope; using node::contextify::ContextifyContext; -using node::url::URL; -using node::url::URL_FLAGS_FAILED; using v8::Array; using v8::ArrayBufferView; using v8::Context; diff --git a/src/node_api.cc b/src/node_api.cc index 49234a23dce800..19cda764565fee 100644 --- a/src/node_api.cc +++ b/src/node_api.cc @@ -657,7 +657,7 @@ void napi_module_register_by_symbol(v8::Local exports, // a file system path. // TODO(gabrielschulhof): Pass the `filename` through unchanged if/when we // receive it as a URL already. - module_filename = node::url::URL::FromFilePath(filename.ToString()).href(); + module_filename = node::url::FromFilePath(filename.ToString()).get_href(); } // Create a new napi_env for this specific module. diff --git a/src/node_url.cc b/src/node_url.cc index b8edc0c21d7e5b..8b058bfe04d952 100644 --- a/src/node_url.cc +++ b/src/node_url.cc @@ -1,28 +1,17 @@ -#include "node_url.h" +#include "ada.h" #include "base_object-inl.h" #include "node_errors.h" #include "node_external_reference.h" #include "node_i18n.h" #include "util-inl.h" -#include #include #include -#include -#include namespace node { using errors::TryCatchScope; -using url::table_data::hex; -using url::table_data::C0_CONTROL_ENCODE_SET; -using url::table_data::FRAGMENT_ENCODE_SET; -using url::table_data::PATH_ENCODE_SET; -using url::table_data::USERINFO_ENCODE_SET; -using url::table_data::QUERY_ENCODE_SET_NONSPECIAL; -using url::table_data::QUERY_ENCODE_SET_SPECIAL; - using v8::Array; using v8::Context; using v8::Function; @@ -50,1785 +39,214 @@ Local Utf8String(Isolate* isolate, const std::string& str) { namespace url { namespace { -// https://url.spec.whatwg.org/#eof-code-point -constexpr char kEOL = -1; - -// 
https://url.spec.whatwg.org/#concept-host -class URLHost { - public: - ~URLHost(); - - void ParseIPv4Host(const char* input, size_t length); - void ParseIPv6Host(const char* input, size_t length); - void ParseOpaqueHost(const char* input, size_t length); - void ParseHost(const char* input, - size_t length, - bool is_special, - bool unicode = false); - - bool ParsingFailed() const { return type_ == HostType::H_FAILED; } - std::string ToString() const; - // Like ToString(), but avoids a copy in exchange for invalidating `*this`. - std::string ToStringMove(); - - private: - enum class HostType { - H_FAILED, - H_DOMAIN, - H_IPV4, - H_IPV6, - H_OPAQUE, - }; - - union Value { - std::string domain_or_opaque; - uint32_t ipv4; - uint16_t ipv6[8]; - - ~Value() {} - Value() : ipv4(0) {} - }; - - Value value_; - HostType type_ = HostType::H_FAILED; - - void Reset() { - using string = std::string; - switch (type_) { - case HostType::H_DOMAIN: - case HostType::H_OPAQUE: - value_.domain_or_opaque.~string(); - break; - default: - break; - } - type_ = HostType::H_FAILED; - } - - // Setting the string members of the union with = is brittle because - // it relies on them being initialized to a state that requires no - // destruction of old data. - // For a long time, that worked well enough because ParseIPv6Host() happens - // to zero-fill `value_`, but that really is relying on standard library - // internals too much. - // These helpers are the easiest solution but we might want to consider - // just not forcing strings into an union. 
- void SetOpaque(std::string&& string) { - Reset(); - type_ = HostType::H_OPAQUE; - new(&value_.domain_or_opaque) std::string(std::move(string)); - } - - void SetDomain(std::string&& string) { - Reset(); - type_ = HostType::H_DOMAIN; - new(&value_.domain_or_opaque) std::string(std::move(string)); - } -}; - -URLHost::~URLHost() { - Reset(); -} - -#define ARGS(XX) \ - XX(ARG_FLAGS) \ - XX(ARG_PROTOCOL) \ - XX(ARG_USERNAME) \ - XX(ARG_PASSWORD) \ - XX(ARG_HOST) \ - XX(ARG_PORT) \ - XX(ARG_PATH) \ - XX(ARG_QUERY) \ - XX(ARG_FRAGMENT) \ - XX(ARG_COUNT) // This one has to be last. - -enum url_cb_args { -#define XX(name) name, - ARGS(XX) -#undef XX +enum url_update_action { + kProtocol = 0, + kHost = 1, + kHostname = 2, + kPort = 3, + kUsername = 4, + kPassword = 5, + kPathname = 6, + kSearch = 7, + kHash = 8, + kHref = 9, }; -#define TWO_CHAR_STRING_TEST(bits, name, expr) \ - template \ - bool name(const T ch1, const T ch2) { \ - static_assert(sizeof(ch1) >= (bits) / 8, \ - "Character must be wider than " #bits " bits"); \ - return (expr); \ - } \ - template \ - bool name(const std::basic_string& str) { \ - static_assert(sizeof(str[0]) >= (bits) / 8, \ - "Character must be wider than " #bits " bits"); \ - return str.length() >= 2 && name(str[0], str[1]); \ - } - -// https://infra.spec.whatwg.org/#ascii-tab-or-newline -CHAR_TEST(8, IsASCIITabOrNewline, (ch == '\t' || ch == '\n' || ch == '\r')) - -// https://infra.spec.whatwg.org/#c0-control -CHAR_TEST(8, IsC0Control, (ch >= '\0' && ch <= '\x1f')) - -// https://infra.spec.whatwg.org/#c0-control-or-space -CHAR_TEST(8, IsC0ControlOrSpace, (ch >= '\0' && ch <= ' ')) - -// https://infra.spec.whatwg.org/#ascii-digit -CHAR_TEST(8, IsASCIIDigit, (ch >= '0' && ch <= '9')) - -CHAR_TEST(8, IsASCIIOcDigit, (ch >= '0' && ch <= '7')) - -// https://infra.spec.whatwg.org/#ascii-hex-digit -CHAR_TEST(8, IsASCIIHexDigit, (IsASCIIDigit(ch) || - (ch >= 'A' && ch <= 'F') || - (ch >= 'a' && ch <= 'f'))) - -// 
https://infra.spec.whatwg.org/#ascii-alpha -CHAR_TEST(8, IsASCIIAlpha, ((ch >= 'A' && ch <= 'Z') || - (ch >= 'a' && ch <= 'z'))) - -// https://infra.spec.whatwg.org/#ascii-alphanumeric -CHAR_TEST(8, IsASCIIAlphanumeric, (IsASCIIDigit(ch) || IsASCIIAlpha(ch))) - -// https://infra.spec.whatwg.org/#ascii-lowercase -template -T ASCIILowercase(T ch) { - return IsASCIIAlpha(ch) ? (ch | 0x20) : ch; -} - -// https://url.spec.whatwg.org/#forbidden-host-code-point -CHAR_TEST(8, - IsForbiddenHostCodePoint, - ch == '\0' || ch == '\t' || ch == '\n' || ch == '\r' || ch == ' ' || - ch == '#' || ch == '/' || ch == ':' || ch == '?' || ch == '@' || - ch == '[' || ch == '<' || ch == '>' || ch == '\\' || ch == ']' || - ch == '^' || ch == '|') - -// https://url.spec.whatwg.org/#forbidden-domain-code-point -CHAR_TEST(8, - IsForbiddenDomainCodePoint, - IsForbiddenHostCodePoint(ch) || IsC0Control(ch) || ch == '%' || - ch == '\x7f') - -// https://url.spec.whatwg.org/#windows-drive-letter -TWO_CHAR_STRING_TEST(8, IsWindowsDriveLetter, - (IsASCIIAlpha(ch1) && (ch2 == ':' || ch2 == '|'))) - -// https://url.spec.whatwg.org/#normalized-windows-drive-letter -TWO_CHAR_STRING_TEST(8, IsNormalizedWindowsDriveLetter, - (IsASCIIAlpha(ch1) && ch2 == ':')) - -#undef TWO_CHAR_STRING_TEST - -bool BitAt(const uint8_t a[], const uint8_t i) { - return !!(a[i >> 3] & (1 << (i & 7))); -} - -// Appends ch to str. If ch position in encode_set is set, the ch will -// be percent-encoded then appended. 
-void AppendOrEscape(std::string* str, - const unsigned char ch, - const uint8_t encode_set[]) { - if (BitAt(encode_set, ch)) - *str += hex + ch * 4; // "%XX\0" has a length of 4 - else - *str += ch; -} - -unsigned hex2bin(const char ch) { - if (ch >= '0' && ch <= '9') - return ch - '0'; - if (ch >= 'A' && ch <= 'F') - return 10 + (ch - 'A'); - if (ch >= 'a' && ch <= 'f') - return 10 + (ch - 'a'); - UNREACHABLE(); -} - -std::string PercentDecode(const char* input, size_t len) { - std::string dest; - if (len == 0) - return dest; - dest.reserve(len); - const char* pointer = input; - const char* end = input + len; - - while (pointer < end) { - const char ch = pointer[0]; - size_t remaining = end - pointer - 1; - if (ch != '%' || remaining < 2 || - (ch == '%' && - (!IsASCIIHexDigit(pointer[1]) || - !IsASCIIHexDigit(pointer[2])))) { - dest += ch; - pointer++; - continue; - } else { - unsigned a = hex2bin(pointer[1]); - unsigned b = hex2bin(pointer[2]); - char c = static_cast(a * 16 + b); - dest += c; - pointer += 3; - } - } - return dest; -} - -#define SPECIALS(XX) \ - XX(ftp, 21, "ftp:") \ - XX(file, -1, "file:") \ - XX(http, 80, "http:") \ - XX(https, 443, "https:") \ - XX(ws, 80, "ws:") \ - XX(wss, 443, "wss:") - -bool IsSpecial(const std::string& scheme) { -#define V(_, __, name) if (scheme == name) return true; - SPECIALS(V); -#undef V - return false; -} - -Local GetSpecial(Environment* env, const std::string& scheme) { -#define V(key, _, name) if (scheme == name) \ - return env->url_special_##key##_string(); - SPECIALS(V) -#undef V - UNREACHABLE(); -} - -int NormalizePort(const std::string& scheme, int p) { -#define V(_, port, name) if (scheme == name && p == port) return -1; - SPECIALS(V); -#undef V - return p; -} - -// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter -bool StartsWithWindowsDriveLetter(const char* p, const char* end) { - size_t length = end - p; - return length >= 2 && - IsWindowsDriveLetter(p[0], p[1]) && - (length == 2 || - p[2] 
== '/' || - p[2] == '\\' || - p[2] == '?' || - p[2] == '#'); -} - -#if defined(NODE_HAVE_I18N_SUPPORT) -bool ToUnicode(const std::string& input, std::string* output) { - MaybeStackBuffer buf; - if (i18n::ToUnicode(&buf, input.c_str(), input.length()) < 0) - return false; - output->assign(*buf, buf.length()); - return true; -} - -bool ToASCII(const std::string& input, std::string* output) { - MaybeStackBuffer buf; - if (i18n::ToASCII(&buf, input.c_str(), input.length()) < 0) - return false; - if (buf.length() == 0) - return false; - output->assign(*buf, buf.length()); - return true; -} -#else // !defined(NODE_HAVE_I18N_SUPPORT) -// Intentional non-ops if ICU is not present. -bool ToUnicode(const std::string& input, std::string* output) { - *output = input; - return true; -} - -bool ToASCII(const std::string& input, std::string* output) { - *output = input; - return true; +void SetArgs(Environment* env, Local argv[10], const ada::url& url) { + Isolate* isolate = env->isolate(); + argv[0] = Utf8String(isolate, url.get_href()); + argv[1] = Utf8String(isolate, url.get_origin()); + argv[2] = Utf8String(isolate, url.get_protocol()); + argv[3] = Utf8String(isolate, url.get_host()); + argv[4] = Utf8String(isolate, url.get_hostname()); + argv[5] = Utf8String(isolate, url.get_pathname()); + argv[6] = Utf8String(isolate, url.get_search()); + argv[7] = Utf8String(isolate, url.get_username()); + argv[8] = Utf8String(isolate, url.get_password()); + argv[9] = Utf8String(isolate, url.get_port()); + argv[10] = Utf8String(isolate, url.get_hash()); } -#endif // !defined(NODE_HAVE_I18N_SUPPORT) - -#define NS_IN6ADDRSZ 16 -void URLHost::ParseIPv6Host(const char* input, size_t length) { - CHECK_EQ(type_, HostType::H_FAILED); - - unsigned char buf[sizeof(struct in6_addr)]; - MaybeStackBuffer ipv6(length + 1); - *(*ipv6 + length) = 0; - memset(buf, 0, sizeof(buf)); - memcpy(*ipv6, input, sizeof(const char) * length); +void Parse(const FunctionCallbackInfo& args) { + CHECK_GE(args.Length(), 
3); + CHECK(args[0]->IsString()); // input + // args[1] // base url + CHECK(args[2]->IsFunction()); // complete callback - int ret = uv_inet_pton(AF_INET6, *ipv6, buf); + Local success_callback_ = args[2].As(); - if (ret != 0) { - return; - } + Environment* env = Environment::GetCurrent(args); + Isolate* isolate = env->isolate(); + HandleScope handle_scope(env->isolate()); + Context::Scope context_scope(env->context()); - // Ref: https://sourceware.org/git/?p=glibc.git;a=blob;f=resolv/inet_ntop.c;h=c4d38c0f951013e51a4fc6eaa8a9b82e146abe5a;hb=HEAD#l119 - for (int i = 0; i < NS_IN6ADDRSZ; i += 2) { - value_.ipv6[i >> 1] = (buf[i] << 8) | buf[i + 1]; + Utf8Value input(env->isolate(), args[0]); + ada::url base; + ada::url* base_pointer = nullptr; + if (args[1]->IsString()) { + base = ada::parse(Utf8Value(env->isolate(), args[1]).ToString()); + base_pointer = &base; } + ada::url out = + ada::parse(std::string_view(input.out(), input.length()), base_pointer); - type_ = HostType::H_IPV6; -} - -// https://url.spec.whatwg.org/#ipv4-number-parser -int64_t ParseIPv4Number(const char* start, const char* end) { - if (end - start == 0) return -1; - - unsigned R = 10; - if (end - start >= 2 && start[0] == '0' && (start[1] | 0x20) == 'x') { - start += 2; - R = 16; - } else if (end - start >= 2 && start[0] == '0') { - start++; - R = 8; + if (!out.is_valid) { + return args.GetReturnValue().Set(false); } - if (end - start == 0) return 0; - - const char* p = start; - - while (p < end) { - const char ch = p[0]; - switch (R) { - case 8: - if (ch < '0' || ch > '7') - return -1; - break; - case 10: - if (!IsASCIIDigit(ch)) - return -1; - break; - case 16: - if (!IsASCIIHexDigit(ch)) - return -1; - break; - } - p++; - } - return strtoll(start, nullptr, R); + const Local undef = Undefined(isolate); + Local argv[] = { + undef, + undef, + undef, + undef, + undef, + undef, + undef, + undef, + undef, + undef, + undef, + }; + SetArgs(env, argv, out); + USE(success_callback_->Call( + 
env->context(), args.This(), arraysize(argv), argv)); + args.GetReturnValue().Set(true); } -// https://url.spec.whatwg.org/#ipv4-number-parser -bool IsIPv4NumberValid(const std::string_view input) { - if (input.empty()) { - return false; - } - - // If a number starts with '0' it might be a number with base 8 or base - // 16. If not, checking if all characters are digits proves that it is a - // base 10 number. - if (input.size() >= 2 && input[0] == '0') { - if (input[1] == 'X' || input[1] == 'x') { - if (input.size() == 2) { - return true; - } - - return std::all_of(input.begin() + 2, input.end(), [](const char& c) { - return IsASCIIHexDigit(c); - }); - } - - return std::all_of(input.begin() + 1, input.end(), [](const char& c) { - return IsASCIIOcDigit(c); - }); - } - - return std::all_of(input.begin(), input.end(), [](const char& c) { - return IsASCIIDigit(c); - }); -} +void DomainToASCII(const FunctionCallbackInfo& args) { + Environment* env = Environment::GetCurrent(args); + CHECK_GE(args.Length(), 1); + CHECK(args[0]->IsString()); -// https://url.spec.whatwg.org/#ends-in-a-number-checker -inline bool EndsInANumber(const std::string_view input) { + std::string input = Utf8Value(env->isolate(), args[0]).ToString(); if (input.empty()) { - return false; - } - - char delimiter = '.'; - auto last_index = input.size() - 1; - if (input.back() == delimiter) { - --last_index; - } - - std::string_view last{}; - auto pos = input.find_last_of(delimiter, last_index); - if (pos == std::string_view::npos) { - last = input.substr(0, last_index); - } else { - last = input.substr(pos + 1, last_index - pos); - } - - if (last.empty()) { - return false; - } - - if (std::all_of(last.begin(), last.end(), [](const char& c) { - return IsASCIIDigit(c); - })) { - return true; - } - - return IsIPv4NumberValid(last); -} - -void URLHost::ParseIPv4Host(const char* input, size_t length) { - CHECK_EQ(type_, HostType::H_FAILED); - const char* pointer = input; - const char* mark = input; - const 
char* end = pointer + length; - unsigned int parts = 0; - uint32_t val = 0; - uint64_t numbers[4]; - unsigned int tooBigNumbers = 0; - if (length == 0) - return; - - while (pointer <= end) { - const char ch = pointer < end ? pointer[0] : kEOL; - int64_t remaining = end - pointer - 1; - if (ch == '.' || ch == kEOL) { - if (++parts > arraysize(numbers)) return; - if (pointer == mark) - return; - int64_t n = ParseIPv4Number(mark, pointer); - if (n < 0) - return; - - if (n > 255) { - tooBigNumbers++; - } - numbers[parts - 1] = n; - mark = pointer + 1; - if (ch == '.' && remaining == 0) - break; - } - pointer++; - } - CHECK_GT(parts, 0); - - // If any but the last item in numbers is greater than 255, return failure. - // If the last item in numbers is greater than or equal to - // 256^(5 - the number of items in numbers), return failure. - if (tooBigNumbers > 1 || (tooBigNumbers == 1 && numbers[parts - 1] <= 255) || - numbers[parts - 1] >= UINT64_C(1) << (8 * (5 - parts))) { - return; - } - - type_ = HostType::H_IPV4; - val = static_cast(numbers[parts - 1]); - for (unsigned int n = 0; n < parts - 1; n++) { - val += static_cast(numbers[n]) << (8 * (3 - n)); + return args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), "")); } - value_.ipv4 = val; -} - -void URLHost::ParseOpaqueHost(const char* input, size_t length) { - CHECK_EQ(type_, HostType::H_FAILED); - std::string output; - output.reserve(length); - for (size_t i = 0; i < length; i++) { - const char ch = input[i]; - if (IsForbiddenHostCodePoint(ch)) { - return; - } else { - AppendOrEscape(&output, ch, C0_CONTROL_ENCODE_SET); - } + // It is important to have an initial value that contains a special scheme. + // Since it will change the implementation of `set_host` according to URL + // spec. 
+ ada::url out = ada::parse("ws://"); + if (!out.set_host(input)) { + return args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), "")); } - - SetOpaque(std::move(output)); + std::string host = out.get_host(); + args.GetReturnValue().Set( + String::NewFromUtf8(env->isolate(), host.c_str()).ToLocalChecked()); } -void URLHost::ParseHost(const char* input, - size_t length, - bool is_special, - bool unicode) { - CHECK_EQ(type_, HostType::H_FAILED); - const char* pointer = input; - - if (length == 0) - return; +void DomainToUnicode(const FunctionCallbackInfo& args) { + Environment* env = Environment::GetCurrent(args); + CHECK_GE(args.Length(), 1); + CHECK(args[0]->IsString()); - if (pointer[0] == '[') { - if (pointer[length - 1] != ']') - return; - return ParseIPv6Host(++pointer, length - 2); + std::string input = Utf8Value(env->isolate(), args[0]).ToString(); + if (input.empty()) { + return args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), "")); } - if (!is_special) - return ParseOpaqueHost(input, length); - - // First, we have to percent decode - std::string decoded = PercentDecode(input, length); - - // Then we have to punycode toASCII - if (!ToASCII(decoded, &decoded)) - return; - - // If any of the following characters are still present, we have to fail - for (size_t n = 0; n < decoded.size(); n++) { - const char ch = decoded[n]; - if (IsForbiddenDomainCodePoint(ch)) { - return; - } + // It is important to have an initial value that contains a special scheme. + // Since it will change the implementation of `set_host` according to URL + // spec. 
+ ada::url out = ada::parse("ws://"); + if (!out.set_host(input)) { + return args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), "")); } + std::string host = out.get_host(); + MaybeStackBuffer buf; + int32_t len = i18n::ToUnicode(&buf, host.data(), host.length()); - // If domain ends in a number, then return the result of IPv4 parsing domain - if (EndsInANumber(decoded)) { - return ParseIPv4Host(decoded.c_str(), decoded.length()); + if (len < 0) { + return args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), "")); } - // If the unicode flag is set, run the result through punycode ToUnicode - if (unicode && !ToUnicode(decoded, &decoded)) - return; - - // It's not an IPv4 or IPv6 address, it must be a domain - SetDomain(std::move(decoded)); + args.GetReturnValue().Set( + String::NewFromUtf8(env->isolate(), *buf, NewStringType::kNormal, len) + .ToLocalChecked()); } -// Locates the longest sequence of 0 segments in an IPv6 address -// in order to use the :: compression when serializing -template -T* FindLongestZeroSequence(T* values, size_t len) { - T* start = values; - T* end = start + len; - T* result = nullptr; +void UpdateUrl(const FunctionCallbackInfo& args) { + CHECK(args[0]->IsString()); // href + CHECK(args[1]->IsNumber()); // action type + CHECK(args[2]->IsString()); // new value + CHECK(args[3]->IsFunction()); // success callback - T* current = nullptr; - unsigned counter = 0, longest = 1; - - while (start < end) { - if (*start == 0) { - if (current == nullptr) - current = start; - counter++; - } else { - if (counter > longest) { - longest = counter; - result = current; - } - counter = 0; - current = nullptr; - } - start++; - } - if (counter > longest) - result = current; - return result; -} - -std::string URLHost::ToStringMove() { - std::string return_value; - switch (type_) { - case HostType::H_DOMAIN: - case HostType::H_OPAQUE: - return_value = std::move(value_.domain_or_opaque); - break; - default: - return_value = ToString(); - 
break; - } - Reset(); - return return_value; -} + Environment* env = Environment::GetCurrent(args); + Isolate* isolate = env->isolate(); -std::string URLHost::ToString() const { - std::string dest; - switch (type_) { - case HostType::H_DOMAIN: - case HostType::H_OPAQUE: - return value_.domain_or_opaque; - case HostType::H_IPV4: { - dest.reserve(15); - uint32_t value = value_.ipv4; - for (int n = 0; n < 4; n++) { - dest.insert(0, std::to_string(value % 256)); - if (n < 3) - dest.insert(0, 1, '.'); - value /= 256; - } + enum url_update_action action = static_cast( + args[1]->Uint32Value(env->context()).FromJust()); + Utf8Value input(isolate, args[0].As()); + Utf8Value new_value(isolate, args[2].As()); + Local success_callback_ = args[3].As(); + + std::string_view new_value_view = + std::string_view(new_value.out(), new_value.length()); + std::string_view input_view = std::string_view(input.out(), input.length()); + ada::url out = ada::parse(input_view); + DCHECK(out.is_valid); + + switch (action) { + case kPathname: { + out.set_pathname(new_value_view); break; } - case HostType::H_IPV6: { - dest.reserve(41); - dest += '['; - const uint16_t* start = &value_.ipv6[0]; - const uint16_t* compress_pointer = - FindLongestZeroSequence(start, 8); - bool ignore0 = false; - for (int n = 0; n <= 7; n++) { - const uint16_t* piece = &value_.ipv6[n]; - if (ignore0 && *piece == 0) - continue; - else if (ignore0) - ignore0 = false; - if (compress_pointer == piece) { - dest += n == 0 ? 
"::" : ":"; - ignore0 = true; - continue; - } - char buf[5]; - snprintf(buf, sizeof(buf), "%x", *piece); - dest += buf; - if (n < 7) - dest += ':'; - } - dest += ']'; + case kHash: { + out.set_hash(new_value_view); break; } - case HostType::H_FAILED: + case kHost: { + out.set_host(new_value_view); break; - } - return dest; -} - -bool ParseHost(const std::string& input, - std::string* output, - bool is_special, - bool unicode = false) { - if (input.empty()) { - output->clear(); - return true; - } - URLHost host; - host.ParseHost(input.c_str(), input.length(), is_special, unicode); - if (host.ParsingFailed()) - return false; - *output = host.ToStringMove(); - return true; -} - -std::vector FromJSStringArray(Environment* env, - Local array) { - std::vector vec; - if (array->Length() > 0) - vec.reserve(array->Length()); - for (size_t n = 0; n < array->Length(); n++) { - Local val = array->Get(env->context(), n).ToLocalChecked(); - if (val->IsString()) { - Utf8Value value(env->isolate(), val.As()); - vec.emplace_back(*value, value.length()); - } - } - return vec; -} - -url_data HarvestBase(Environment* env, Local base_obj) { - url_data base; - Local context = env->context(); - - Local flags = - base_obj->Get(env->context(), env->flags_string()).ToLocalChecked(); - if (flags->IsInt32()) - base.flags = flags->Int32Value(context).FromJust(); - - Local port = - base_obj->Get(env->context(), env->port_string()).ToLocalChecked(); - if (port->IsInt32()) - base.port = port->Int32Value(context).FromJust(); - - Local scheme = - base_obj->Get(env->context(), env->scheme_string()).ToLocalChecked(); - base.scheme = Utf8Value(env->isolate(), scheme).out(); - - auto GetStr = [&](std::string url_data::*member, - int flag, - Local name, - bool empty_as_present) { - Local value = base_obj->Get(env->context(), name).ToLocalChecked(); - if (value->IsString()) { - Utf8Value utf8value(env->isolate(), value.As()); - (base.*member).assign(*utf8value, utf8value.length()); - if (empty_as_present 
|| value.As()->Length() != 0) { - base.flags |= flag; - } - } - }; - GetStr(&url_data::username, - URL_FLAGS_HAS_USERNAME, - env->username_string(), - false); - GetStr(&url_data::password, - URL_FLAGS_HAS_PASSWORD, - env->password_string(), - false); - GetStr(&url_data::host, URL_FLAGS_HAS_HOST, env->host_string(), true); - GetStr(&url_data::query, URL_FLAGS_HAS_QUERY, env->query_string(), true); - GetStr(&url_data::fragment, - URL_FLAGS_HAS_FRAGMENT, - env->fragment_string(), - true); - - Local - path = base_obj->Get(env->context(), env->path_string()).ToLocalChecked(); - if (path->IsArray()) { - base.flags |= URL_FLAGS_HAS_PATH; - base.path = FromJSStringArray(env, path.As()); - } - return base; -} - -url_data HarvestContext(Environment* env, Local context_obj) { - url_data context; - Local flags = - context_obj->Get(env->context(), env->flags_string()).ToLocalChecked(); - if (flags->IsInt32()) { - static constexpr int32_t kCopyFlagsMask = - URL_FLAGS_SPECIAL | - URL_FLAGS_CANNOT_BE_BASE | - URL_FLAGS_HAS_USERNAME | - URL_FLAGS_HAS_PASSWORD | - URL_FLAGS_HAS_HOST; - context.flags |= flags.As()->Value() & kCopyFlagsMask; - } - Local scheme = - context_obj->Get(env->context(), env->scheme_string()).ToLocalChecked(); - if (scheme->IsString()) { - Utf8Value value(env->isolate(), scheme); - context.scheme.assign(*value, value.length()); - } - Local port = - context_obj->Get(env->context(), env->port_string()).ToLocalChecked(); - if (port->IsInt32()) - context.port = port.As()->Value(); - if (context.flags & URL_FLAGS_HAS_USERNAME) { - Local username = - context_obj->Get(env->context(), - env->username_string()).ToLocalChecked(); - CHECK(username->IsString()); - Utf8Value value(env->isolate(), username); - context.username.assign(*value, value.length()); - } - if (context.flags & URL_FLAGS_HAS_PASSWORD) { - Local password = - context_obj->Get(env->context(), - env->password_string()).ToLocalChecked(); - CHECK(password->IsString()); - Utf8Value value(env->isolate(), 
password); - context.password.assign(*value, value.length()); - } - Local host = - context_obj->Get(env->context(), - env->host_string()).ToLocalChecked(); - if (host->IsString()) { - Utf8Value value(env->isolate(), host); - context.host.assign(*value, value.length()); - } - return context; -} - -// Single dot segment can be ".", "%2e", or "%2E" -bool IsSingleDotSegment(const std::string& str) { - switch (str.size()) { - case 1: - return str == "."; - case 3: - return str[0] == '%' && - str[1] == '2' && - ASCIILowercase(str[2]) == 'e'; - default: - return false; - } -} - -// Double dot segment can be: -// "..", ".%2e", ".%2E", "%2e.", "%2E.", -// "%2e%2e", "%2E%2E", "%2e%2E", or "%2E%2e" -bool IsDoubleDotSegment(const std::string& str) { - switch (str.size()) { - case 2: - return str == ".."; - case 4: - if (str[0] != '.' && str[0] != '%') - return false; - return ((str[0] == '.' && - str[1] == '%' && - str[2] == '2' && - ASCIILowercase(str[3]) == 'e') || - (str[0] == '%' && - str[1] == '2' && - ASCIILowercase(str[2]) == 'e' && - str[3] == '.')); - case 6: - return (str[0] == '%' && - str[1] == '2' && - ASCIILowercase(str[2]) == 'e' && - str[3] == '%' && - str[4] == '2' && - ASCIILowercase(str[5]) == 'e'); - default: - return false; - } -} - -void ShortenUrlPath(struct url_data* url) { - if (url->path.empty()) return; - if (url->path.size() == 1 && url->scheme == "file:" && - IsNormalizedWindowsDriveLetter(url->path[0])) return; - url->path.pop_back(); -} - -} // anonymous namespace - -void URL::Parse(const char* input, - size_t len, - enum url_parse_state state_override, - struct url_data* url, - bool has_url, - const struct url_data* base, - bool has_base) { - const char* p = input; - const char* end = input + len; - - if (!has_url) { - for (const char* ptr = p; ptr < end; ptr++) { - if (IsC0ControlOrSpace(*ptr)) - p++; - else - break; } - for (const char* ptr = end - 1; ptr >= p; ptr--) { - if (IsC0ControlOrSpace(*ptr)) - end--; - else - break; + case kHostname: 
{ + out.set_hostname(new_value_view); + break; } - input = p; - len = end - p; - } - - // The spec says we should strip out any ASCII tabs or newlines. - // In those cases, we create another std::string instance with the filtered - // contents, but in the general case we avoid the overhead. - std::string whitespace_stripped; - for (const char* ptr = p; ptr < end; ptr++) { - if (!IsASCIITabOrNewline(*ptr)) - continue; - // Hit tab or newline. Allocate storage, copy what we have until now, - // and then iterate and filter all similar characters out. - whitespace_stripped.reserve(len - 1); - whitespace_stripped.assign(p, ptr - p); - // 'ptr + 1' skips the current char, which we know to be tab or newline. - for (ptr = ptr + 1; ptr < end; ptr++) { - if (!IsASCIITabOrNewline(*ptr)) - whitespace_stripped += *ptr; + case kHref: { + out.set_href(new_value_view); + break; } - - // Update variables like they should have looked like if the string - // had been stripped of whitespace to begin with. - input = whitespace_stripped.c_str(); - len = whitespace_stripped.size(); - p = input; - end = input + len; - break; - } - - bool atflag = false; // Set when @ has been seen. - bool square_bracket_flag = false; // Set inside of [...] - bool password_token_seen_flag = false; // Set after a : after an username. - - std::string buffer; - - // Set the initial parse state. - const bool has_state_override = state_override != kUnknownState; - enum url_parse_state state = has_state_override ? state_override : - kSchemeStart; - - if (state < kSchemeStart || state > kFragment) { - url->flags |= URL_FLAGS_INVALID_PARSE_STATE; - return; - } - - while (p <= end) { - const char ch = p < end ? 
p[0] : kEOL; - bool special = (url->flags & URL_FLAGS_SPECIAL); - bool cannot_be_base; - bool special_back_slash = (special && ch == '\\'); - - switch (state) { - case kSchemeStart: - if (IsASCIIAlpha(ch)) { - buffer += ASCIILowercase(ch); - state = kScheme; - } else if (!has_state_override) { - state = kNoScheme; - continue; - } else { - url->flags |= URL_FLAGS_FAILED; - return; - } - break; - case kScheme: - if (IsASCIIAlphanumeric(ch) || ch == '+' || ch == '-' || ch == '.') { - buffer += ASCIILowercase(ch); - } else if (ch == ':' || (has_state_override && ch == kEOL)) { - if (has_state_override && buffer.size() == 0) { - url->flags |= URL_FLAGS_TERMINATED; - return; - } - buffer += ':'; - - bool new_is_special = IsSpecial(buffer); - - if (has_state_override) { - if ((special != new_is_special) || - ((buffer == "file:") && - ((url->flags & URL_FLAGS_HAS_USERNAME) || - (url->flags & URL_FLAGS_HAS_PASSWORD) || - (url->port != -1))) || - (url->scheme == "file:" && url->host.empty())) { - url->flags |= URL_FLAGS_TERMINATED; - return; - } - } - - url->scheme = std::move(buffer); - url->port = NormalizePort(url->scheme, url->port); - if (new_is_special) { - url->flags |= URL_FLAGS_SPECIAL; - special = true; - } else { - url->flags &= ~URL_FLAGS_SPECIAL; - special = false; - } - // `special_back_slash` equals to `(special && ch == '\\')` and `ch` - // here always not equals to `\\`. So `special_back_slash` here always - // equals to `false`. 
- special_back_slash = false; - buffer.clear(); - if (has_state_override) - return; - if (url->scheme == "file:") { - state = kFile; - } else if (special && - has_base && - url->scheme == base->scheme) { - state = kSpecialRelativeOrAuthority; - } else if (special) { - state = kSpecialAuthoritySlashes; - } else if (p + 1 < end && p[1] == '/') { - state = kPathOrAuthority; - p++; - } else { - url->flags |= URL_FLAGS_CANNOT_BE_BASE; - url->flags |= URL_FLAGS_HAS_PATH; - url->path.emplace_back(""); - state = kCannotBeBase; - } - } else if (!has_state_override) { - buffer.clear(); - state = kNoScheme; - p = input; - continue; - } else { - url->flags |= URL_FLAGS_FAILED; - return; - } - break; - case kNoScheme: - cannot_be_base = has_base && (base->flags & URL_FLAGS_CANNOT_BE_BASE); - if (!has_base || (cannot_be_base && ch != '#')) { - url->flags |= URL_FLAGS_FAILED; - return; - } else if (cannot_be_base && ch == '#') { - url->scheme = base->scheme; - if (IsSpecial(url->scheme)) { - url->flags |= URL_FLAGS_SPECIAL; - special = true; - } else { - url->flags &= ~URL_FLAGS_SPECIAL; - special = false; - } - special_back_slash = (special && ch == '\\'); - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - } - if (base->flags & URL_FLAGS_HAS_QUERY) { - url->flags |= URL_FLAGS_HAS_QUERY; - url->query = base->query; - } - if (base->flags & URL_FLAGS_HAS_FRAGMENT) { - url->flags |= URL_FLAGS_HAS_FRAGMENT; - url->fragment = base->fragment; - } - url->flags |= URL_FLAGS_CANNOT_BE_BASE; - state = kFragment; - } else if (has_base && - base->scheme != "file:") { - state = kRelative; - continue; - } else { - url->scheme = "file:"; - url->flags |= URL_FLAGS_SPECIAL; - special = true; - state = kFile; - special_back_slash = (special && ch == '\\'); - continue; - } - break; - case kSpecialRelativeOrAuthority: - if (ch == '/' && p + 1 < end && p[1] == '/') { - state = kSpecialAuthorityIgnoreSlashes; - p++; - } else { - state = kRelative; 
- continue; - } - break; - case kPathOrAuthority: - if (ch == '/') { - state = kAuthority; - } else { - state = kPath; - continue; - } - break; - case kRelative: - url->scheme = base->scheme; - if (IsSpecial(url->scheme)) { - url->flags |= URL_FLAGS_SPECIAL; - special = true; - } else { - url->flags &= ~URL_FLAGS_SPECIAL; - special = false; - } - special_back_slash = (special && ch == '\\'); - switch (ch) { - case kEOL: - if (base->flags & URL_FLAGS_HAS_USERNAME) { - url->flags |= URL_FLAGS_HAS_USERNAME; - url->username = base->username; - } - if (base->flags & URL_FLAGS_HAS_PASSWORD) { - url->flags |= URL_FLAGS_HAS_PASSWORD; - url->password = base->password; - } - if (base->flags & URL_FLAGS_HAS_HOST) { - url->flags |= URL_FLAGS_HAS_HOST; - url->host = base->host; - } - if (base->flags & URL_FLAGS_HAS_QUERY) { - url->flags |= URL_FLAGS_HAS_QUERY; - url->query = base->query; - } - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - } - url->port = base->port; - break; - case '/': - state = kRelativeSlash; - break; - case '?': - if (base->flags & URL_FLAGS_HAS_USERNAME) { - url->flags |= URL_FLAGS_HAS_USERNAME; - url->username = base->username; - } - if (base->flags & URL_FLAGS_HAS_PASSWORD) { - url->flags |= URL_FLAGS_HAS_PASSWORD; - url->password = base->password; - } - if (base->flags & URL_FLAGS_HAS_HOST) { - url->flags |= URL_FLAGS_HAS_HOST; - url->host = base->host; - } - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - } - url->port = base->port; - state = kQuery; - break; - case '#': - if (base->flags & URL_FLAGS_HAS_USERNAME) { - url->flags |= URL_FLAGS_HAS_USERNAME; - url->username = base->username; - } - if (base->flags & URL_FLAGS_HAS_PASSWORD) { - url->flags |= URL_FLAGS_HAS_PASSWORD; - url->password = base->password; - } - if (base->flags & URL_FLAGS_HAS_HOST) { - url->flags |= URL_FLAGS_HAS_HOST; - url->host = base->host; - } - if (base->flags & 
URL_FLAGS_HAS_QUERY) { - url->flags |= URL_FLAGS_HAS_QUERY; - url->query = base->query; - } - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - } - url->port = base->port; - state = kFragment; - break; - default: - if (special_back_slash) { - state = kRelativeSlash; - } else { - if (base->flags & URL_FLAGS_HAS_USERNAME) { - url->flags |= URL_FLAGS_HAS_USERNAME; - url->username = base->username; - } - if (base->flags & URL_FLAGS_HAS_PASSWORD) { - url->flags |= URL_FLAGS_HAS_PASSWORD; - url->password = base->password; - } - if (base->flags & URL_FLAGS_HAS_HOST) { - url->flags |= URL_FLAGS_HAS_HOST; - url->host = base->host; - } - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - ShortenUrlPath(url); - } - url->port = base->port; - state = kPath; - continue; - } - } - break; - case kRelativeSlash: - if (IsSpecial(url->scheme) && (ch == '/' || ch == '\\')) { - state = kSpecialAuthorityIgnoreSlashes; - } else if (ch == '/') { - state = kAuthority; - } else { - if (base->flags & URL_FLAGS_HAS_USERNAME) { - url->flags |= URL_FLAGS_HAS_USERNAME; - url->username = base->username; - } - if (base->flags & URL_FLAGS_HAS_PASSWORD) { - url->flags |= URL_FLAGS_HAS_PASSWORD; - url->password = base->password; - } - if (base->flags & URL_FLAGS_HAS_HOST) { - url->flags |= URL_FLAGS_HAS_HOST; - url->host = base->host; - } - url->port = base->port; - state = kPath; - continue; - } - break; - case kSpecialAuthoritySlashes: - state = kSpecialAuthorityIgnoreSlashes; - if (ch == '/' && p + 1 < end && p[1] == '/') { - p++; - } else { - continue; - } - break; - case kSpecialAuthorityIgnoreSlashes: - if (ch != '/' && ch != '\\') { - state = kAuthority; - continue; - } - break; - case kAuthority: - if (ch == '@') { - if (atflag) { - buffer.reserve(buffer.size() + 3); - buffer.insert(0, "%40"); - } - atflag = true; - size_t blen = buffer.size(); - if (blen > 0 && buffer[0] != ':') { - 
url->flags |= URL_FLAGS_HAS_USERNAME; - } - for (size_t n = 0; n < blen; n++) { - const char bch = buffer[n]; - if (bch == ':') { - url->flags |= URL_FLAGS_HAS_PASSWORD; - if (!password_token_seen_flag) { - password_token_seen_flag = true; - continue; - } - } - if (password_token_seen_flag) { - AppendOrEscape(&url->password, bch, USERINFO_ENCODE_SET); - } else { - AppendOrEscape(&url->username, bch, USERINFO_ENCODE_SET); - } - } - buffer.clear(); - } else if (ch == kEOL || - ch == '/' || - ch == '?' || - ch == '#' || - special_back_slash) { - if (atflag && buffer.size() == 0) { - url->flags |= URL_FLAGS_FAILED; - return; - } - p -= buffer.size() + 1; - buffer.clear(); - state = kHost; - } else { - buffer += ch; - } - break; - case kHost: - case kHostname: - if (has_state_override && url->scheme == "file:") { - state = kFileHost; - continue; - } else if (ch == ':' && !square_bracket_flag) { - if (buffer.size() == 0) { - url->flags |= URL_FLAGS_FAILED; - return; - } - if (state_override == kHostname) { - return; - } - url->flags |= URL_FLAGS_HAS_HOST; - if (!ParseHost(buffer, &url->host, special)) { - url->flags |= URL_FLAGS_FAILED; - return; - } - buffer.clear(); - state = kPort; - } else if (ch == kEOL || - ch == '/' || - ch == '?' 
|| - ch == '#' || - special_back_slash) { - p--; - if (special && buffer.size() == 0) { - url->flags |= URL_FLAGS_FAILED; - return; - } - if (has_state_override && - buffer.size() == 0 && - ((url->username.size() > 0 || url->password.size() > 0) || - url->port != -1)) { - url->flags |= URL_FLAGS_TERMINATED; - return; - } - url->flags |= URL_FLAGS_HAS_HOST; - if (!ParseHost(buffer, &url->host, special)) { - url->flags |= URL_FLAGS_FAILED; - return; - } - buffer.clear(); - state = kPathStart; - if (has_state_override) { - return; - } - } else { - if (ch == '[') - square_bracket_flag = true; - if (ch == ']') - square_bracket_flag = false; - buffer += ch; - } - break; - case kPort: - if (IsASCIIDigit(ch)) { - buffer += ch; - } else if (has_state_override || - ch == kEOL || - ch == '/' || - ch == '?' || - ch == '#' || - special_back_slash) { - if (buffer.size() > 0) { - unsigned port = 0; - // the condition port <= 0xffff prevents integer overflow - for (size_t i = 0; port <= 0xffff && i < buffer.size(); i++) - port = port * 10 + buffer[i] - '0'; - if (port > 0xffff) { - // TODO(TimothyGu): This hack is currently needed for the host - // setter since it needs access to hostname if it is valid, and - // if the FAILED flag is set the entire response to JS layer - // will be empty. - if (state_override == kHost) - url->port = -1; - else - url->flags |= URL_FLAGS_FAILED; - return; - } - // the port is valid - url->port = NormalizePort(url->scheme, static_cast(port)); - if (url->port == -1) - url->flags |= URL_FLAGS_IS_DEFAULT_SCHEME_PORT; - buffer.clear(); - } else if (has_state_override) { - // TODO(TimothyGu): Similar case as above. 
- if (state_override == kHost) - url->port = -1; - else - url->flags |= URL_FLAGS_TERMINATED; - return; - } - state = kPathStart; - continue; - } else { - url->flags |= URL_FLAGS_FAILED; - return; - } - break; - case kFile: - url->scheme = "file:"; - url->host.clear(); - url->flags |= URL_FLAGS_HAS_HOST; - if (ch == '/' || ch == '\\') { - state = kFileSlash; - } else if (has_base && base->scheme == "file:") { - switch (ch) { - case kEOL: - if (base->flags & URL_FLAGS_HAS_HOST) { - url->host = base->host; - } - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - } - if (base->flags & URL_FLAGS_HAS_QUERY) { - url->flags |= URL_FLAGS_HAS_QUERY; - url->query = base->query; - } - break; - case '?': - if (base->flags & URL_FLAGS_HAS_HOST) { - url->host = base->host; - } - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - } - url->flags |= URL_FLAGS_HAS_QUERY; - url->query.clear(); - state = kQuery; - break; - case '#': - if (base->flags & URL_FLAGS_HAS_HOST) { - url->host = base->host; - } - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - } - if (base->flags & URL_FLAGS_HAS_QUERY) { - url->flags |= URL_FLAGS_HAS_QUERY; - url->query = base->query; - } - url->flags |= URL_FLAGS_HAS_FRAGMENT; - url->fragment.clear(); - state = kFragment; - break; - default: - url->query.clear(); - if (base->flags & URL_FLAGS_HAS_HOST) { - url->host = base->host; - } - if (base->flags & URL_FLAGS_HAS_PATH) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path = base->path; - } - if (!StartsWithWindowsDriveLetter(p, end)) { - ShortenUrlPath(url); - } else { - url->path.clear(); - } - state = kPath; - continue; - } - } else { - state = kPath; - continue; - } - break; - case kFileSlash: - if (ch == '/' || ch == '\\') { - state = kFileHost; - } else { - if (has_base && base->scheme == "file:") { - url->flags |= URL_FLAGS_HAS_HOST; - 
url->host = base->host; - if (!StartsWithWindowsDriveLetter(p, end) && - IsNormalizedWindowsDriveLetter(base->path[0])) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path.push_back(base->path[0]); - } - } - state = kPath; - continue; - } - break; - case kFileHost: - if (ch == kEOL || - ch == '/' || - ch == '\\' || - ch == '?' || - ch == '#') { - if (!has_state_override && - buffer.size() == 2 && - IsWindowsDriveLetter(buffer)) { - state = kPath; - } else if (buffer.size() == 0) { - url->flags |= URL_FLAGS_HAS_HOST; - url->host.clear(); - if (has_state_override) - return; - state = kPathStart; - } else { - std::string host; - if (!ParseHost(buffer, &host, special)) { - url->flags |= URL_FLAGS_FAILED; - return; - } - if (host == "localhost") - host.clear(); - url->flags |= URL_FLAGS_HAS_HOST; - url->host = host; - if (has_state_override) - return; - buffer.clear(); - state = kPathStart; - } - continue; - } else { - buffer += ch; - } - break; - case kPathStart: - if (IsSpecial(url->scheme)) { - state = kPath; - if (ch != '/' && ch != '\\') { - continue; - } - } else if (!has_state_override && ch == '?') { - url->flags |= URL_FLAGS_HAS_QUERY; - url->query.clear(); - state = kQuery; - } else if (!has_state_override && ch == '#') { - url->flags |= URL_FLAGS_HAS_FRAGMENT; - url->fragment.clear(); - state = kFragment; - } else if (ch != kEOL) { - state = kPath; - if (ch != '/') { - continue; - } - } else if (has_state_override && !(url->flags & URL_FLAGS_HAS_HOST)) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path.emplace_back(""); - } - break; - case kPath: - if (ch == kEOL || - ch == '/' || - special_back_slash || - (!has_state_override && (ch == '?' 
|| ch == '#'))) { - if (IsDoubleDotSegment(buffer)) { - ShortenUrlPath(url); - if (ch != '/' && !special_back_slash) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path.emplace_back(""); - } - } else if (IsSingleDotSegment(buffer) && - ch != '/' && !special_back_slash) { - url->flags |= URL_FLAGS_HAS_PATH; - url->path.emplace_back(""); - } else if (!IsSingleDotSegment(buffer)) { - if (url->scheme == "file:" && - url->path.empty() && - buffer.size() == 2 && - IsWindowsDriveLetter(buffer)) { - buffer[1] = ':'; - } - url->flags |= URL_FLAGS_HAS_PATH; - url->path.emplace_back(std::move(buffer)); - } - buffer.clear(); - if (ch == '?') { - url->flags |= URL_FLAGS_HAS_QUERY; - url->query.clear(); - state = kQuery; - } else if (ch == '#') { - url->flags |= URL_FLAGS_HAS_FRAGMENT; - url->fragment.clear(); - state = kFragment; - } - } else { - AppendOrEscape(&buffer, ch, PATH_ENCODE_SET); - } - break; - case kCannotBeBase: - switch (ch) { - case '?': - state = kQuery; - break; - case '#': - state = kFragment; - break; - default: - if (url->path.empty()) - url->path.emplace_back(""); - else if (ch != kEOL) - AppendOrEscape(&url->path[0], ch, C0_CONTROL_ENCODE_SET); - } - break; - case kQuery: - if (ch == kEOL || (!has_state_override && ch == '#')) { - url->flags |= URL_FLAGS_HAS_QUERY; - url->query = std::move(buffer); - buffer.clear(); - if (ch == '#') - state = kFragment; - } else { - AppendOrEscape(&buffer, ch, special ? 
QUERY_ENCODE_SET_SPECIAL : - QUERY_ENCODE_SET_NONSPECIAL); - } - break; - case kFragment: - switch (ch) { - case kEOL: - url->flags |= URL_FLAGS_HAS_FRAGMENT; - url->fragment = std::move(buffer); - break; - default: - AppendOrEscape(&buffer, ch, FRAGMENT_ENCODE_SET); - } - break; - default: - url->flags |= URL_FLAGS_INVALID_PARSE_STATE; - return; + case kPassword: { + out.set_password(new_value_view); + break; } - - p++; - } -} // NOLINT(readability/fn_size) - -// https://url.spec.whatwg.org/#url-serializing -std::string URL::SerializeURL(const url_data& url, - bool exclude = false) { - std::string output; - output.reserve( - 10 + // We generally insert < 10 separator characters between URL parts - url.scheme.size() + - url.username.size() + - url.password.size() + - url.host.size() + - url.query.size() + - url.fragment.size() + - url.href.size() + - std::accumulate( - url.path.begin(), - url.path.end(), - 0, - [](size_t sum, const auto& str) { return sum + str.size(); })); - - output += url.scheme; - if (url.flags & URL_FLAGS_HAS_HOST) { - output += "//"; - if (url.flags & URL_FLAGS_HAS_USERNAME || - url.flags & URL_FLAGS_HAS_PASSWORD) { - if (url.flags & URL_FLAGS_HAS_USERNAME) { - output += url.username; - } - if (url.flags & URL_FLAGS_HAS_PASSWORD) { - output += ":" + url.password; - } - output += "@"; + case kPort: { + out.set_port(new_value_view); + break; } - output += url.host; - if (url.port != -1) { - output += ":" + std::to_string(url.port); + case kProtocol: { + out.set_protocol(new_value_view); + break; } - } - if (url.flags & URL_FLAGS_CANNOT_BE_BASE) { - output += url.path[0]; - } else { - if (!(url.flags & URL_FLAGS_HAS_HOST) && - url.path.size() > 1 && - url.path[0].empty()) { - output += "/."; + case kSearch: { + out.set_search(new_value_view); + break; } - for (size_t i = 1; i < url.path.size(); i++) { - output += "/" + url.path[i]; + case kUsername: { + out.set_username(new_value_view); + break; } } - if (url.flags & URL_FLAGS_HAS_QUERY) { - 
output += "?" + url.query; - } - if (!exclude && (url.flags & URL_FLAGS_HAS_FRAGMENT)) { - output += "#" + url.fragment; - } - output.shrink_to_fit(); - return output; -} -namespace { -void SetArgs(Environment* env, - Local argv[ARG_COUNT], - const struct url_data& url) { - Isolate* isolate = env->isolate(); - argv[ARG_FLAGS] = Integer::NewFromUnsigned(isolate, url.flags); - argv[ARG_PROTOCOL] = - url.flags & URL_FLAGS_SPECIAL ? - GetSpecial(env, url.scheme) : - OneByteString(isolate, url.scheme.c_str()); - if (url.flags & URL_FLAGS_HAS_USERNAME) - argv[ARG_USERNAME] = Utf8String(isolate, url.username); - if (url.flags & URL_FLAGS_HAS_PASSWORD) - argv[ARG_PASSWORD] = Utf8String(isolate, url.password); - if (url.flags & URL_FLAGS_HAS_HOST) - argv[ARG_HOST] = Utf8String(isolate, url.host); - if (url.flags & URL_FLAGS_HAS_QUERY) - argv[ARG_QUERY] = Utf8String(isolate, url.query); - if (url.flags & URL_FLAGS_HAS_FRAGMENT) - argv[ARG_FRAGMENT] = Utf8String(isolate, url.fragment); - if (url.port > -1) - argv[ARG_PORT] = Integer::New(isolate, url.port); - if (url.flags & URL_FLAGS_HAS_PATH) - argv[ARG_PATH] = ToV8Value(env->context(), url.path).ToLocalChecked(); -} - -void Parse(Environment* env, - Local recv, - const char* input, - size_t len, - enum url_parse_state state_override, - Local base_obj, - Local context_obj, - Local cb, - Local error_cb) { - Isolate* isolate = env->isolate(); - Local context = env->context(); - HandleScope handle_scope(isolate); - Context::Scope context_scope(context); - - const bool has_context = context_obj->IsObject(); - const bool has_base = base_obj->IsObject(); - - url_data base; - url_data url; - if (has_context) - url = HarvestContext(env, context_obj.As()); - if (has_base) - base = HarvestBase(env, base_obj.As()); - - URL::Parse(input, len, state_override, &url, has_context, &base, has_base); - if ((url.flags & URL_FLAGS_INVALID_PARSE_STATE) || - ((state_override != kUnknownState) && - (url.flags & URL_FLAGS_TERMINATED))) - return; - 
- // Define the return value placeholders const Local undef = Undefined(isolate); - const Local null = Null(isolate); - if (!(url.flags & URL_FLAGS_FAILED)) { - Local argv[] = { + Local argv[] = { undef, undef, undef, undef, - null, // host defaults to null - null, // port defaults to null undef, - null, // query defaults to null - null, // fragment defaults to null - }; - SetArgs(env, argv, url); - USE(cb->Call(context, recv, arraysize(argv), argv)); - } else if (error_cb->IsFunction()) { - Local flags = Integer::NewFromUnsigned(isolate, url.flags); - USE(error_cb.As()->Call(context, recv, 1, &flags)); - } -} - -void Parse(const FunctionCallbackInfo& args) { - Environment* env = Environment::GetCurrent(args); - CHECK_GE(args.Length(), 5); - CHECK(args[0]->IsString()); // input - CHECK(args[2]->IsUndefined() || // base context - args[2]->IsNull() || - args[2]->IsObject()); - CHECK(args[3]->IsUndefined() || // context - args[3]->IsNull() || - args[3]->IsObject()); - CHECK(args[4]->IsFunction()); // complete callback - CHECK(args[5]->IsUndefined() || args[5]->IsFunction()); // error callback - - Utf8Value input(env->isolate(), args[0]); - enum url_parse_state state_override = kUnknownState; - if (args[1]->IsNumber()) { - state_override = static_cast( - args[1]->Uint32Value(env->context()).FromJust()); - } - - Parse(env, args.This(), - *input, input.length(), - state_override, - args[2], - args[3], - args[4].As(), - args[5]); -} - -void EncodeAuthSet(const FunctionCallbackInfo& args) { - Environment* env = Environment::GetCurrent(args); - CHECK_GE(args.Length(), 1); - CHECK(args[0]->IsString()); - Utf8Value value(env->isolate(), args[0]); - std::string output; - size_t len = value.length(); - output.reserve(len); - for (size_t n = 0; n < len; n++) { - const char ch = (*value)[n]; - AppendOrEscape(&output, ch, USERINFO_ENCODE_SET); - } - args.GetReturnValue().Set( - String::NewFromUtf8(env->isolate(), output.c_str()).ToLocalChecked()); -} - -void DomainToASCII(const 
FunctionCallbackInfo& args) { - Environment* env = Environment::GetCurrent(args); - CHECK_GE(args.Length(), 1); - CHECK(args[0]->IsString()); - Utf8Value value(env->isolate(), args[0]); - - URLHost host; - // Assuming the host is used for a special scheme. - host.ParseHost(*value, value.length(), true); - if (host.ParsingFailed()) { - args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), "")); - return; - } - std::string out = host.ToStringMove(); - args.GetReturnValue().Set( - String::NewFromUtf8(env->isolate(), out.c_str()).ToLocalChecked()); -} - -void DomainToUnicode(const FunctionCallbackInfo& args) { - Environment* env = Environment::GetCurrent(args); - CHECK_GE(args.Length(), 1); - CHECK(args[0]->IsString()); - Utf8Value value(env->isolate(), args[0]); - - URLHost host; - // Assuming the host is used for a special scheme. - host.ParseHost(*value, value.length(), true, true); - if (host.ParsingFailed()) { - args.GetReturnValue().Set(FIXED_ONE_BYTE_STRING(env->isolate(), "")); - return; - } - std::string out = host.ToStringMove(); - args.GetReturnValue().Set( - String::NewFromUtf8(env->isolate(), out.c_str()).ToLocalChecked()); -} - -void SetURLConstructor(const FunctionCallbackInfo& args) { - Environment* env = Environment::GetCurrent(args); - CHECK_EQ(args.Length(), 1); - CHECK(args[0]->IsFunction()); - env->set_url_constructor_function(args[0].As()); + undef, + undef, + undef, + undef, + undef, + undef, + }; + SetArgs(env, argv, out); + USE(success_callback_->Call( + env->context(), args.This(), arraysize(argv), argv)); } void Initialize(Local target, @@ -1836,144 +254,19 @@ void Initialize(Local target, Local context, void* priv) { SetMethod(context, target, "parse", Parse); - SetMethodNoSideEffect(context, target, "encodeAuth", EncodeAuthSet); + SetMethod(context, target, "updateUrl", UpdateUrl); + SetMethodNoSideEffect(context, target, "domainToASCII", DomainToASCII); SetMethodNoSideEffect(context, target, "domainToUnicode", DomainToUnicode); - 
SetMethod(context, target, "setURLConstructor", SetURLConstructor); - -#define XX(name, _) NODE_DEFINE_CONSTANT(target, name); - FLAGS(XX) -#undef XX - -#define XX(name) NODE_DEFINE_CONSTANT(target, name); - PARSESTATES(XX) -#undef XX } } // namespace void RegisterExternalReferences(ExternalReferenceRegistry* registry) { registry->Register(Parse); - registry->Register(EncodeAuthSet); + registry->Register(UpdateUrl); + registry->Register(DomainToASCII); registry->Register(DomainToUnicode); - registry->Register(SetURLConstructor); -} - -std::string URL::ToFilePath() const { - if (context_.scheme != "file:") { - return ""; - } - -#ifdef _WIN32 - const char* slash = "\\"; - auto is_slash = [] (char ch) { - return ch == '/' || ch == '\\'; - }; -#else - const char* slash = "/"; - auto is_slash = [] (char ch) { - return ch == '/'; - }; - if ((context_.flags & URL_FLAGS_HAS_HOST) && - context_.host.length() > 0) { - return ""; - } -#endif - std::string decoded_path; - for (const std::string& part : context_.path) { - std::string decoded = PercentDecode(part.c_str(), part.length()); - for (char& ch : decoded) { - if (is_slash(ch)) { - return ""; - } - } - decoded_path += slash + decoded; - } - -#ifdef _WIN32 - // TODO(TimothyGu): Use "\\?\" long paths on Windows. - - // If hostname is set, then we have a UNC path. Pass the hostname through - // ToUnicode just in case it is an IDN using punycode encoding. We do not - // need to worry about percent encoding because the URL parser will have - // already taken care of that for us. Note that this only causes IDNs with an - // appropriate `xn--` prefix to be decoded. - if ((context_.flags & URL_FLAGS_HAS_HOST) && - context_.host.length() > 0) { - std::string unicode_host; - if (!ToUnicode(context_.host, &unicode_host)) { - return ""; - } - return "\\\\" + unicode_host + decoded_path; - } - // Otherwise, it's a local path that requires a drive letter. 
- if (decoded_path.length() < 3) { - return ""; - } - if (decoded_path[2] != ':' || - !IsASCIIAlpha(decoded_path[1])) { - return ""; - } - // Strip out the leading '\'. - return decoded_path.substr(1); -#else - return decoded_path; -#endif -} - -URL URL::FromFilePath(const std::string& file_path) { - URL url("file://"); - std::string escaped_file_path; - for (size_t i = 0; i < file_path.length(); ++i) { - escaped_file_path += file_path[i]; - if (file_path[i] == '%') - escaped_file_path += "25"; - } - URL::Parse(escaped_file_path.c_str(), escaped_file_path.length(), kPathStart, - &url.context_, true, nullptr, false); - return url; -} - -// This function works by calling out to a JS function that creates and -// returns the JS URL object. Be mindful of the JS<->Native boundary -// crossing that is required. -MaybeLocal URL::ToObject(Environment* env) const { - Isolate* isolate = env->isolate(); - Local context = env->context(); - Context::Scope context_scope(context); - - const Local undef = Undefined(isolate); - const Local null = Null(isolate); - - if (context_.flags & URL_FLAGS_FAILED) - return Local(); - - Local argv[] = { - undef, - undef, - undef, - undef, - null, // host defaults to null - null, // port defaults to null - undef, - null, // query defaults to null - null, // fragment defaults to null - }; - SetArgs(env, argv, context_); - - MaybeLocal ret; - { - TryCatchScope try_catch(env, TryCatchScope::CatchMode::kFatal); - - // The SetURLConstructor method must have been called already to - // set the constructor function used below. SetURLConstructor is - // called automatically when the internal/url.js module is loaded - // during the internal/bootstrap/node.js processing. 
- ret = env->url_constructor_function() - ->Call(env->context(), undef, arraysize(argv), argv); - } - - return ret; } } // namespace url diff --git a/src/node_url.h b/src/node_url.h index d7b9a1c368cdae..23df6c85adfa5e 100644 --- a/src/node_url.h +++ b/src/node_url.h @@ -3,6 +3,7 @@ #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS +#include "ada.h" #include "node.h" #include @@ -10,191 +11,16 @@ namespace node { namespace url { -#define PARSESTATES(XX) \ - XX(kSchemeStart) \ - XX(kScheme) \ - XX(kNoScheme) \ - XX(kSpecialRelativeOrAuthority) \ - XX(kPathOrAuthority) \ - XX(kRelative) \ - XX(kRelativeSlash) \ - XX(kSpecialAuthoritySlashes) \ - XX(kSpecialAuthorityIgnoreSlashes) \ - XX(kAuthority) \ - XX(kHost) \ - XX(kHostname) \ - XX(kPort) \ - XX(kFile) \ - XX(kFileSlash) \ - XX(kFileHost) \ - XX(kPathStart) \ - XX(kPath) \ - XX(kCannotBeBase) \ - XX(kQuery) \ - XX(kFragment) - -#define FLAGS(XX) \ - XX(URL_FLAGS_NONE, 0) \ - XX(URL_FLAGS_FAILED, 0x01) \ - XX(URL_FLAGS_CANNOT_BE_BASE, 0x02) \ - XX(URL_FLAGS_INVALID_PARSE_STATE, 0x04) \ - XX(URL_FLAGS_TERMINATED, 0x08) \ - XX(URL_FLAGS_SPECIAL, 0x10) \ - XX(URL_FLAGS_HAS_USERNAME, 0x20) \ - XX(URL_FLAGS_HAS_PASSWORD, 0x40) \ - XX(URL_FLAGS_HAS_HOST, 0x80) \ - XX(URL_FLAGS_HAS_PATH, 0x100) \ - XX(URL_FLAGS_HAS_QUERY, 0x200) \ - XX(URL_FLAGS_HAS_FRAGMENT, 0x400) \ - XX(URL_FLAGS_IS_DEFAULT_SCHEME_PORT, 0x800) \ - -enum url_parse_state { - kUnknownState = -1, -#define XX(name) name, - PARSESTATES(XX) -#undef XX -}; - -enum url_flags { -#define XX(name, val) name = val, - FLAGS(XX) -#undef XX -}; - -struct url_data { - int32_t flags = URL_FLAGS_NONE; - int port = -1; - std::string scheme; - std::string username; - std::string password; - std::string host; - std::string query; - std::string fragment; - std::vector path; - std::string href; -}; - -namespace table_data { -extern const char hex[1024]; -extern const uint8_t C0_CONTROL_ENCODE_SET[32]; -extern const uint8_t FRAGMENT_ENCODE_SET[32]; -extern const 
uint8_t PATH_ENCODE_SET[32]; -extern const uint8_t USERINFO_ENCODE_SET[32]; -extern const uint8_t QUERY_ENCODE_SET_NONSPECIAL[32]; -extern const uint8_t QUERY_ENCODE_SET_SPECIAL[32]; -} - -class URL { - public: - static void Parse(const char* input, - size_t len, - enum url_parse_state state_override, - struct url_data* url, - bool has_url, - const struct url_data* base, - bool has_base); - - static std::string SerializeURL(const url_data& url, bool exclude); - - URL(const char* input, const size_t len) { - Parse(input, len, kUnknownState, &context_, false, nullptr, false); - } - - URL(const char* input, const size_t len, const URL* base) { - if (base != nullptr) - Parse(input, len, kUnknownState, - &context_, false, - &(base->context_), true); - else - Parse(input, len, kUnknownState, &context_, false, nullptr, false); +static ada::url FromFilePath(const std::string& file_path) { + ada::url url = ada::parse("file://"); + std::string escaped_file_path; + for (size_t i = 0; i < file_path.length(); ++i) { + escaped_file_path += file_path[i]; + if (file_path[i] == '%') escaped_file_path += "25"; } - - URL(const char* input, const size_t len, - const char* base, const size_t baselen) { - if (base != nullptr && baselen > 0) { - URL _base(base, baselen); - Parse(input, len, kUnknownState, - &context_, false, - &(_base.context_), true); - } else { - Parse(input, len, kUnknownState, &context_, false, nullptr, false); - } - } - - explicit URL(const std::string& input) : - URL(input.c_str(), input.length()) {} - - URL(const std::string& input, const URL* base) : - URL(input.c_str(), input.length(), base) {} - - URL(const std::string& input, const URL& base) : - URL(input.c_str(), input.length(), &base) {} - - URL(const std::string& input, const std::string& base) : - URL(input.c_str(), input.length(), base.c_str(), base.length()) {} - - int32_t flags() const { - return context_.flags; - } - - int port() const { - return context_.port; - } - - const std::string& protocol() 
const { - return context_.scheme; - } - - const std::string& username() const { - return context_.username; - } - - const std::string& password() const { - return context_.password; - } - - const std::string& host() const { - return context_.host; - } - - const std::string& query() const { - return context_.query; - } - - const std::string& fragment() const { - return context_.fragment; - } - - std::string path() const { - std::string ret; - for (const std::string& element : context_.path) { - ret += '/' + element; - } - return ret; - } - - std::string href() const { - return SerializeURL(context_, false); - } - - // Get the path of the file: URL in a format consumable by native file system - // APIs. Returns an empty string if something went wrong. - std::string ToFilePath() const; - // Get the file URL from native file system path. - static URL FromFilePath(const std::string& file_path); - - v8::MaybeLocal ToObject(Environment* env) const; - - URL(const URL&) = default; - URL& operator=(const URL&) = default; - URL(URL&&) = default; - URL& operator=(URL&&) = default; - - URL() : URL("") {} - - private: - url_data context_; -}; + url.set_pathname(escaped_file_path); + return url; +} } // namespace url diff --git a/src/node_url_tables.cc b/src/node_url_tables.cc deleted file mode 100644 index 801badf838dc83..00000000000000 --- a/src/node_url_tables.cc +++ /dev/null @@ -1,448 +0,0 @@ -#include -#include "node_url.h" - -namespace node { -namespace url { -namespace table_data { - -const char hex[1024] = - "%00\0%01\0%02\0%03\0%04\0%05\0%06\0%07\0" - "%08\0%09\0%0A\0%0B\0%0C\0%0D\0%0E\0%0F\0" - "%10\0%11\0%12\0%13\0%14\0%15\0%16\0%17\0" - "%18\0%19\0%1A\0%1B\0%1C\0%1D\0%1E\0%1F\0" - "%20\0%21\0%22\0%23\0%24\0%25\0%26\0%27\0" - "%28\0%29\0%2A\0%2B\0%2C\0%2D\0%2E\0%2F\0" - "%30\0%31\0%32\0%33\0%34\0%35\0%36\0%37\0" - "%38\0%39\0%3A\0%3B\0%3C\0%3D\0%3E\0%3F\0" - "%40\0%41\0%42\0%43\0%44\0%45\0%46\0%47\0" - "%48\0%49\0%4A\0%4B\0%4C\0%4D\0%4E\0%4F\0" - 
"%50\0%51\0%52\0%53\0%54\0%55\0%56\0%57\0" - "%58\0%59\0%5A\0%5B\0%5C\0%5D\0%5E\0%5F\0" - "%60\0%61\0%62\0%63\0%64\0%65\0%66\0%67\0" - "%68\0%69\0%6A\0%6B\0%6C\0%6D\0%6E\0%6F\0" - "%70\0%71\0%72\0%73\0%74\0%75\0%76\0%77\0" - "%78\0%79\0%7A\0%7B\0%7C\0%7D\0%7E\0%7F\0" - "%80\0%81\0%82\0%83\0%84\0%85\0%86\0%87\0" - "%88\0%89\0%8A\0%8B\0%8C\0%8D\0%8E\0%8F\0" - "%90\0%91\0%92\0%93\0%94\0%95\0%96\0%97\0" - "%98\0%99\0%9A\0%9B\0%9C\0%9D\0%9E\0%9F\0" - "%A0\0%A1\0%A2\0%A3\0%A4\0%A5\0%A6\0%A7\0" - "%A8\0%A9\0%AA\0%AB\0%AC\0%AD\0%AE\0%AF\0" - "%B0\0%B1\0%B2\0%B3\0%B4\0%B5\0%B6\0%B7\0" - "%B8\0%B9\0%BA\0%BB\0%BC\0%BD\0%BE\0%BF\0" - "%C0\0%C1\0%C2\0%C3\0%C4\0%C5\0%C6\0%C7\0" - "%C8\0%C9\0%CA\0%CB\0%CC\0%CD\0%CE\0%CF\0" - "%D0\0%D1\0%D2\0%D3\0%D4\0%D5\0%D6\0%D7\0" - "%D8\0%D9\0%DA\0%DB\0%DC\0%DD\0%DE\0%DF\0" - "%E0\0%E1\0%E2\0%E3\0%E4\0%E5\0%E6\0%E7\0" - "%E8\0%E9\0%EA\0%EB\0%EC\0%ED\0%EE\0%EF\0" - "%F0\0%F1\0%F2\0%F3\0%F4\0%F5\0%F6\0%F7\0" - "%F8\0%F9\0%FA\0%FB\0%FC\0%FD\0%FE\0%FF"; - -const uint8_t C0_CONTROL_ENCODE_SET[32] = { - // 00 01 02 03 04 05 06 07 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 08 09 0A 0B 0C 0D 0E 0F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 10 11 12 13 14 15 16 17 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 18 19 1A 1B 1C 1D 1E 1F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 20 21 22 23 24 25 26 27 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 28 29 2A 2B 2C 2D 2E 2F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 30 31 32 33 34 35 36 37 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 38 39 3A 3B 3C 3D 3E 3F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 40 41 42 43 44 45 46 47 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 48 49 4A 4B 4C 4D 4E 4F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 50 51 52 53 54 55 56 57 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 58 59 5A 5B 5C 
5D 5E 5F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 60 61 62 63 64 65 66 67 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 68 69 6A 6B 6C 6D 6E 6F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 70 71 72 73 74 75 76 77 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 78 79 7A 7B 7C 7D 7E 7F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, - // 80 81 82 83 84 85 86 87 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 88 89 8A 8B 8C 8D 8E 8F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 90 91 92 93 94 95 96 97 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 98 99 9A 9B 9C 9D 9E 9F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A0 A1 A2 A3 A4 A5 A6 A7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A8 A9 AA AB AC AD AE AF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B0 B1 B2 B3 B4 B5 B6 B7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B8 B9 BA BB BC BD BE BF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C0 C1 C2 C3 C4 C5 C6 C7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C8 C9 CA CB CC CD CE CF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D0 D1 D2 D3 D4 D5 D6 D7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D8 D9 DA DB DC DD DE DF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E0 E1 E2 E3 E4 E5 E6 E7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E8 E9 EA EB EC ED EE EF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F0 F1 F2 F3 F4 F5 F6 F7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F8 F9 FA FB FC FD FE FF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 -}; - -const uint8_t FRAGMENT_ENCODE_SET[32] = { - // 00 01 02 03 04 05 06 07 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 08 09 0A 0B 0C 0D 0E 0F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 
0x80, - // 10 11 12 13 14 15 16 17 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 18 19 1A 1B 1C 1D 1E 1F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 20 21 22 23 24 25 26 27 - 0x01 | 0x00 | 0x04 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 28 29 2A 2B 2C 2D 2E 2F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 30 31 32 33 34 35 36 37 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 38 39 3A 3B 3C 3D 3E 3F - 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x00, - // 40 41 42 43 44 45 46 47 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 48 49 4A 4B 4C 4D 4E 4F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 50 51 52 53 54 55 56 57 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 58 59 5A 5B 5C 5D 5E 5F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 60 61 62 63 64 65 66 67 - 0x01 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 68 69 6A 6B 6C 6D 6E 6F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 70 71 72 73 74 75 76 77 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 78 79 7A 7B 7C 7D 7E 7F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, - // 80 81 82 83 84 85 86 87 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 88 89 8A 8B 8C 8D 8E 8F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 90 91 92 93 94 95 96 97 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 98 99 9A 9B 9C 9D 9E 9F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A0 A1 A2 A3 A4 A5 A6 A7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A8 A9 AA AB AC AD AE AF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B0 B1 B2 B3 B4 B5 B6 B7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B8 B9 BA BB BC BD BE BF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C0 C1 C2 C3 C4 C5 C6 C7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C8 C9 CA 
CB CC CD CE CF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D0 D1 D2 D3 D4 D5 D6 D7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D8 D9 DA DB DC DD DE DF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E0 E1 E2 E3 E4 E5 E6 E7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E8 E9 EA EB EC ED EE EF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F0 F1 F2 F3 F4 F5 F6 F7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F8 F9 FA FB FC FD FE FF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 -}; - - -const uint8_t PATH_ENCODE_SET[32] = { - // 00 01 02 03 04 05 06 07 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 08 09 0A 0B 0C 0D 0E 0F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 10 11 12 13 14 15 16 17 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 18 19 1A 1B 1C 1D 1E 1F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 20 21 22 23 24 25 26 27 - 0x01 | 0x00 | 0x04 | 0x08 | 0x00 | 0x00 | 0x00 | 0x00, - // 28 29 2A 2B 2C 2D 2E 2F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 30 31 32 33 34 35 36 37 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 38 39 3A 3B 3C 3D 3E 3F - 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x80, - // 40 41 42 43 44 45 46 47 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 48 49 4A 4B 4C 4D 4E 4F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 50 51 52 53 54 55 56 57 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 58 59 5A 5B 5C 5D 5E 5F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 60 61 62 63 64 65 66 67 - 0x01 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 68 69 6A 6B 6C 6D 6E 6F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 70 71 72 73 74 75 76 77 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 78 79 7A 7B 7C 7D 7E 7F - 0x00 | 0x00 | 0x00 | 0x08 | 0x00 | 0x20 | 
0x00 | 0x80, - // 80 81 82 83 84 85 86 87 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 88 89 8A 8B 8C 8D 8E 8F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 90 91 92 93 94 95 96 97 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 98 99 9A 9B 9C 9D 9E 9F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A0 A1 A2 A3 A4 A5 A6 A7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A8 A9 AA AB AC AD AE AF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B0 B1 B2 B3 B4 B5 B6 B7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B8 B9 BA BB BC BD BE BF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C0 C1 C2 C3 C4 C5 C6 C7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C8 C9 CA CB CC CD CE CF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D0 D1 D2 D3 D4 D5 D6 D7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D8 D9 DA DB DC DD DE DF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E0 E1 E2 E3 E4 E5 E6 E7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E8 E9 EA EB EC ED EE EF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F0 F1 F2 F3 F4 F5 F6 F7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F8 F9 FA FB FC FD FE FF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 -}; - -const uint8_t USERINFO_ENCODE_SET[32] = { - // 00 01 02 03 04 05 06 07 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 08 09 0A 0B 0C 0D 0E 0F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 10 11 12 13 14 15 16 17 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 18 19 1A 1B 1C 1D 1E 1F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 20 21 22 23 24 25 26 27 - 0x01 | 0x00 | 0x04 | 0x08 | 0x00 | 0x00 | 0x00 | 0x00, - // 28 29 2A 2B 2C 2D 2E 2F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, - // 30 31 32 33 34 35 36 37 - 0x00 | 0x00 | 
0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 38 39 3A 3B 3C 3D 3E 3F - 0x00 | 0x00 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 40 41 42 43 44 45 46 47 - 0x01 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 48 49 4A 4B 4C 4D 4E 4F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 50 51 52 53 54 55 56 57 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 58 59 5A 5B 5C 5D 5E 5F - 0x00 | 0x00 | 0x00 | 0x08 | 0x10 | 0x20 | 0x40 | 0x00, - // 60 61 62 63 64 65 66 67 - 0x01 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 68 69 6A 6B 6C 6D 6E 6F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 70 71 72 73 74 75 76 77 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 78 79 7A 7B 7C 7D 7E 7F - 0x00 | 0x00 | 0x00 | 0x08 | 0x10 | 0x20 | 0x00 | 0x80, - // 80 81 82 83 84 85 86 87 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 88 89 8A 8B 8C 8D 8E 8F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 90 91 92 93 94 95 96 97 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 98 99 9A 9B 9C 9D 9E 9F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A0 A1 A2 A3 A4 A5 A6 A7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A8 A9 AA AB AC AD AE AF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B0 B1 B2 B3 B4 B5 B6 B7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B8 B9 BA BB BC BD BE BF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C0 C1 C2 C3 C4 C5 C6 C7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C8 C9 CA CB CC CD CE CF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D0 D1 D2 D3 D4 D5 D6 D7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D8 D9 DA DB DC DD DE DF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E0 E1 E2 E3 E4 E5 E6 E7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E8 E9 EA EB EC ED EE EF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 
0x20 | 0x40 | 0x80, - // F0 F1 F2 F3 F4 F5 F6 F7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F8 F9 FA FB FC FD FE FF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 -}; - -const uint8_t QUERY_ENCODE_SET_NONSPECIAL[32] = { - // 00 01 02 03 04 05 06 07 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 08 09 0A 0B 0C 0D 0E 0F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 10 11 12 13 14 15 16 17 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 18 19 1A 1B 1C 1D 1E 1F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 20 21 22 23 24 25 26 27 - 0x01 | 0x00 | 0x04 | 0x08 | 0x00 | 0x00 | 0x00 | 0x00, - // 28 29 2A 2B 2C 2D 2E 2F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 30 31 32 33 34 35 36 37 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 38 39 3A 3B 3C 3D 3E 3F - 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x00, - // 40 41 42 43 44 45 46 47 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 48 49 4A 4B 4C 4D 4E 4F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 50 51 52 53 54 55 56 57 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 58 59 5A 5B 5C 5D 5E 5F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 60 61 62 63 64 65 66 67 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 68 69 6A 6B 6C 6D 6E 6F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 70 71 72 73 74 75 76 77 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 78 79 7A 7B 7C 7D 7E 7F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, - // 80 81 82 83 84 85 86 87 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 88 89 8A 8B 8C 8D 8E 8F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 90 91 92 93 94 95 96 97 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 98 99 9A 9B 9C 9D 9E 9F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A0 A1 A2 A3 A4 A5 A6 A7 - 
0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A8 A9 AA AB AC AD AE AF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B0 B1 B2 B3 B4 B5 B6 B7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B8 B9 BA BB BC BD BE BF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C0 C1 C2 C3 C4 C5 C6 C7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C8 C9 CA CB CC CD CE CF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D0 D1 D2 D3 D4 D5 D6 D7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D8 D9 DA DB DC DD DE DF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E0 E1 E2 E3 E4 E5 E6 E7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E8 E9 EA EB EC ED EE EF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F0 F1 F2 F3 F4 F5 F6 F7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F8 F9 FA FB FC FD FE FF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 -}; - -// Same as QUERY_ENCODE_SET_NONSPECIAL, but with 0x27 (') encoded. 
-const uint8_t QUERY_ENCODE_SET_SPECIAL[32] = { - // 00 01 02 03 04 05 06 07 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 08 09 0A 0B 0C 0D 0E 0F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 10 11 12 13 14 15 16 17 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 18 19 1A 1B 1C 1D 1E 1F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 20 21 22 23 24 25 26 27 - 0x01 | 0x00 | 0x04 | 0x08 | 0x00 | 0x00 | 0x00 | 0x80, - // 28 29 2A 2B 2C 2D 2E 2F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 30 31 32 33 34 35 36 37 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 38 39 3A 3B 3C 3D 3E 3F - 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x00, - // 40 41 42 43 44 45 46 47 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 48 49 4A 4B 4C 4D 4E 4F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 50 51 52 53 54 55 56 57 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 58 59 5A 5B 5C 5D 5E 5F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 60 61 62 63 64 65 66 67 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 68 69 6A 6B 6C 6D 6E 6F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 70 71 72 73 74 75 76 77 - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, - // 78 79 7A 7B 7C 7D 7E 7F - 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, - // 80 81 82 83 84 85 86 87 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 88 89 8A 8B 8C 8D 8E 8F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 90 91 92 93 94 95 96 97 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // 98 99 9A 9B 9C 9D 9E 9F - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A0 A1 A2 A3 A4 A5 A6 A7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // A8 A9 AA AB AC AD AE AF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // B0 B1 B2 B3 B4 B5 B6 B7 - 0x01 | 0x02 | 0x04 | 0x08 | 
0x10 | 0x20 | 0x40 | 0x80, - // B8 B9 BA BB BC BD BE BF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C0 C1 C2 C3 C4 C5 C6 C7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // C8 C9 CA CB CC CD CE CF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D0 D1 D2 D3 D4 D5 D6 D7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // D8 D9 DA DB DC DD DE DF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E0 E1 E2 E3 E4 E5 E6 E7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // E8 E9 EA EB EC ED EE EF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F0 F1 F2 F3 F4 F5 F6 F7 - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, - // F8 F9 FA FB FC FD FE FF - 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 -}; - -} // namespace table_data -} // namespace url -} // namespace node diff --git a/test/cctest/test_url.cc b/test/cctest/test_url.cc deleted file mode 100644 index f2430b3d506ac1..00000000000000 --- a/test/cctest/test_url.cc +++ /dev/null @@ -1,218 +0,0 @@ -#include "node_url.h" -#include "node_i18n.h" -#include "util-inl.h" - -#include "gtest/gtest.h" - -using node::url::URL; -using node::url::URL_FLAGS_FAILED; - -class URLTest : public ::testing::Test { - protected: - void SetUp() override { -#if defined(NODE_HAVE_I18N_SUPPORT) - std::string icu_data_dir; - node::i18n::InitializeICUDirectory(icu_data_dir); -#endif - } - - void TearDown() override {} -}; - -TEST_F(URLTest, Simple) { - URL simple("https://example.org:81/a/b/c?query#fragment"); - - EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "https:"); - EXPECT_EQ(simple.host(), "example.org"); - EXPECT_EQ(simple.port(), 81); - EXPECT_EQ(simple.path(), "/a/b/c"); - EXPECT_EQ(simple.query(), "query"); - EXPECT_EQ(simple.fragment(), "fragment"); -} - -TEST_F(URLTest, Simple2) { - const char* input = "https://example.org:81/a/b/c?query#fragment"; - URL simple(input, strlen(input)); - - 
EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "https:"); - EXPECT_EQ(simple.host(), "example.org"); - EXPECT_EQ(simple.port(), 81); - EXPECT_EQ(simple.path(), "/a/b/c"); - EXPECT_EQ(simple.query(), "query"); - EXPECT_EQ(simple.fragment(), "fragment"); -} - -TEST_F(URLTest, ForbiddenHostCodePoint) { - URL error("https://exa|mple.org:81/a/b/c?query#fragment"); - EXPECT_TRUE(error.flags() & URL_FLAGS_FAILED); -} - -TEST_F(URLTest, NoBase1) { - URL error("123noscheme"); - EXPECT_TRUE(error.flags() & URL_FLAGS_FAILED); -} - -TEST_F(URLTest, Base1) { - URL base("http://example.org/foo/bar"); - ASSERT_FALSE(base.flags() & URL_FLAGS_FAILED); - - URL simple("../baz", &base); - EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "http:"); - EXPECT_EQ(simple.host(), "example.org"); - EXPECT_EQ(simple.path(), "/baz"); -} - -TEST_F(URLTest, Base2) { - URL simple("../baz", "http://example.org/foo/bar"); - - EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "http:"); - EXPECT_EQ(simple.host(), "example.org"); - EXPECT_EQ(simple.path(), "/baz"); -} - -TEST_F(URLTest, Base3) { - const char* input = "../baz"; - const char* base = "http://example.org/foo/bar"; - - URL simple(input, strlen(input), base, strlen(base)); - - EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "http:"); - EXPECT_EQ(simple.host(), "example.org"); - EXPECT_EQ(simple.path(), "/baz"); -} - -TEST_F(URLTest, Base4) { - const char* input = "\\x"; - const char* base = "http://example.org/foo/bar"; - - URL simple(input, strlen(input), base, strlen(base)); - - EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "http:"); - EXPECT_EQ(simple.host(), "example.org"); - EXPECT_EQ(simple.path(), "/x"); -} - -TEST_F(URLTest, Base5) { - const char* input = "/x"; - const char* base = "http://example.org/foo/bar"; - - URL simple(input, strlen(input), base, strlen(base)); - - 
EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "http:"); - EXPECT_EQ(simple.host(), "example.org"); - EXPECT_EQ(simple.path(), "/x"); -} - -TEST_F(URLTest, Base6) { - const char* input = "\\\\x"; - const char* base = "http://example.org/foo/bar"; - - URL simple(input, strlen(input), base, strlen(base)); - - EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "http:"); - EXPECT_EQ(simple.host(), "x"); -} - -TEST_F(URLTest, Base7) { - const char* input = "//x"; - const char* base = "http://example.org/foo/bar"; - - URL simple(input, strlen(input), base, strlen(base)); - - EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "http:"); - EXPECT_EQ(simple.host(), "x"); -} - -TEST_F(URLTest, TruncatedAfterProtocol) { - char input[2] = { 'q', ':' }; - URL simple(input, sizeof(input)); - - EXPECT_FALSE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "q:"); - EXPECT_EQ(simple.host(), ""); - EXPECT_EQ(simple.path(), "/"); -} - -TEST_F(URLTest, TruncatedAfterProtocol2) { - char input[6] = { 'h', 't', 't', 'p', ':', '/' }; - URL simple(input, sizeof(input)); - - EXPECT_TRUE(simple.flags() & URL_FLAGS_FAILED); - EXPECT_EQ(simple.protocol(), "http:"); - EXPECT_EQ(simple.host(), ""); - EXPECT_EQ(simple.path(), ""); -} - -TEST_F(URLTest, ToFilePath) { -#define T(url, path) EXPECT_EQ(path, URL(url).ToFilePath()) - T("http://example.org/foo/bar", ""); - -#ifdef _WIN32 - T("file:///C:/Program%20Files/", "C:\\Program Files\\"); - T("file:///C:/a/b/c?query#fragment", "C:\\a\\b\\c"); - T("file://host/path/a/b/c?query#fragment", "\\\\host\\path\\a\\b\\c"); -#if defined(NODE_HAVE_I18N_SUPPORT) - T("file://xn--weird-prdj8vva.com/host/a", "\\\\wͪ͊eiͬ͋rd.com\\host\\a"); -#else - T("file://xn--weird-prdj8vva.com/host/a", - "\\\\xn--weird-prdj8vva.com\\host\\a"); -#endif - T("file:///C:/a%2Fb", ""); - T("file:///", ""); - T("file:///home", ""); -#else - T("file:///", "/"); - 
T("file:///home/user?query#fragment", "/home/user"); - T("file:///home/user/?query#fragment", "/home/user/"); - T("file:///home/user/%20space", "/home/user/ space"); - T("file:///home/us%5Cer", "/home/us\\er"); - T("file:///home/us%2Fer", ""); - T("file://host/path", ""); -#endif - -#undef T -} - -TEST_F(URLTest, FromFilePath) { - URL file_url; -#ifdef _WIN32 - file_url = URL::FromFilePath("C:\\Program Files\\"); - EXPECT_EQ("file:", file_url.protocol()); - EXPECT_EQ("//C:/Program%20Files/", file_url.path()); - EXPECT_EQ("file:///C:/Program%20Files/", file_url.href()); - - file_url = URL::FromFilePath("C:\\a\\b\\c"); - EXPECT_EQ("file:", file_url.protocol()); - EXPECT_EQ("//C:/a/b/c", file_url.path()); - EXPECT_EQ("file:///C:/a/b/c", file_url.href()); - - file_url = URL::FromFilePath("b:\\a\\%%.js"); - EXPECT_EQ("file:", file_url.protocol()); - EXPECT_EQ("//b:/a/%25%25.js", file_url.path()); - EXPECT_EQ("file:///b:/a/%25%25.js", file_url.href()); -#else - file_url = URL::FromFilePath("/"); - EXPECT_EQ("file:", file_url.protocol()); - EXPECT_EQ("//", file_url.path()); - EXPECT_EQ("file:///", file_url.href()); - - file_url = URL::FromFilePath("/a/b/c"); - EXPECT_EQ("file:", file_url.protocol()); - EXPECT_EQ("//a/b/c", file_url.path()); - EXPECT_EQ("file:///a/b/c", file_url.href()); - - file_url = URL::FromFilePath("/a/%%.js"); - EXPECT_EQ("file:", file_url.protocol()); - EXPECT_EQ("//a/%25%25.js", file_url.path()); - EXPECT_EQ("file:///a/%25%25.js", file_url.href()); -#endif -} diff --git a/test/fuzzers/fuzz_url.cc b/test/fuzzers/fuzz_url.cc deleted file mode 100644 index 16c5f644893f86..00000000000000 --- a/test/fuzzers/fuzz_url.cc +++ /dev/null @@ -1,11 +0,0 @@ -#include - -#include "node.h" -#include "node_internals.h" -#include "node_url.h" - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - node::url::URL url2(reinterpret_cast(data), size); - - return 0; -} diff --git a/test/parallel/test-whatwg-url-custom-inspect.js 
b/test/parallel/test-whatwg-url-custom-inspect.js index ad77f5725d30ed..7c762a192564ba 100644 --- a/test/parallel/test-whatwg-url-custom-inspect.js +++ b/test/parallel/test-whatwg-url-custom-inspect.js @@ -45,18 +45,18 @@ assert.strictEqual( search: '?que=ry', searchParams: URLSearchParams { 'que' => 'ry' }, hash: '#hash', - cannotBeBase: false, - special: true, [Symbol(context)]: URLContext { - flags: 2032, - scheme: 'https:', + href: 'https://username:password@host.name:8080/path/name/?que=ry#hash', + origin: 'https://host.name:8080', + protocol: 'https:', + host: 'host.name:8080', + hostname: 'host.name', + pathname: '/path/name/', + search: '?que=ry', username: 'username', password: 'password', - host: 'host.name', - port: 8080, - path: [ 'path', 'name', '', [length]: 3 ], - query: 'que=ry', - fragment: 'hash' + port: '8080', + hash: '#hash' } }`); diff --git a/tools/license-builder.sh b/tools/license-builder.sh index 8389f24c681082..1b52a473a15bf2 100755 --- a/tools/license-builder.sh +++ b/tools/license-builder.sh @@ -81,6 +81,8 @@ licenseText="$(sed -e '/The data format used by the zlib library/,$d' -e 's/^\/\ addlicense "zlib" "deps/zlib" "$licenseText" licenseText="$(cat "${rootdir}/deps/simdutf/LICENSE-MIT")" addlicense "simdutf" "deps/simdutf" "$licenseText" +licenseText="$(curl -sL https://raw.githubusercontent.com/ada-url/ada/HEAD/LICENSE-MIT)" +addlicense "ada" "deps/ada" "$licenseText" # npm licenseText="$(cat "${rootdir}/deps/npm/LICENSE")"