From 9fff747803e1589e8b8c5bf5b8fc7c1d951cff41 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 14:59:29 -0400 Subject: [PATCH 01/10] Ok. --- CMakeLists.txt | 1 + cmake/import.cmake | 52 ++++++++++++++++++++++++++++++++++ tests/CMakeLists.txt | 7 +++-- tests/bitset_container_unit.c | 10 +++---- tools/cmake/FindCTargets.cmake | 10 ++++--- tools/cmake/Import.cmake | 32 +++++++++++++++------ 6 files changed, 92 insertions(+), 20 deletions(-) create mode 100644 cmake/import.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 56dbd3f2e..21011042f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,7 @@ configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/tests/config.h.in" add_subdirectory(src) if(ENABLE_ROARING_TESTS) + add_subdirectory(microbenchmarks) add_subdirectory(benchmarks) add_subdirectory(tests) endif() diff --git a/cmake/import.cmake b/cmake/import.cmake new file mode 100644 index 000000000..a9b6ffe5d --- /dev/null +++ b/cmake/import.cmake @@ -0,0 +1,52 @@ +set(dep_root "${PROJEC_SOURCE_DIR}/dependencies/.cache") +if(DEFINED ENV{roaring_DEPENDENCY_CACHE_DIR}) + set(dep_root "$ENV{roaring_DEPENDENCY_CACHE_DIR}") +endif() + +function(import_dependency NAME GITHUB_REPO COMMIT) + message(STATUS "Importing ${NAME} (${GITHUB_REPO}@${COMMIT})") + set(target "${dep_root}/${NAME}") + + # If the folder exists in the cache, then we assume that everything is as + # should be and do nothing + if(EXISTS "${target}") + set("${NAME}_SOURCE_DIR" "${target}" PARENT_SCOPE) + return() + endif() + + set(zip_url "https://github.com/${GITHUB_REPO}/archive/${COMMIT}.zip") + set(archive "${dep_root}/archive.zip") + set(dest "${dep_root}/_extract") + + file(DOWNLOAD "${zip_url}" "${archive}") + file(MAKE_DIRECTORY "${dest}") + execute_process( + WORKING_DIRECTORY "${dest}" + COMMAND "${CMAKE_COMMAND}" -E tar xf "${archive}") + file(REMOVE "${archive}") + + # GitHub archives only ever have one folder component at the root, so this + # will always match that single folder + file(GLOB dir LIST_DIRECTORIES YES "${dest}/*") + + file(RENAME "${dir}" "${target}") + + set("${NAME}_SOURCE_DIR" "${target}" PARENT_SCOPE) +endfunction() + +# Delegates to the dependency +macro(add_dependency NAME) + if(NOT DEFINED "${NAME}_SOURCE_DIR") + message(FATAL_ERROR "Missing ${NAME}_SOURCE_DIR variable") + endif() + + add_subdirectory("${${NAME}_SOURCE_DIR}" "${PROJECT_BINARY_DIR}/_deps/${NAME}" EXCLUDE_FROM_ALL) +endmacro() + +function(set_off NAME) + set("${NAME}" OFF CACHE INTERNAL "") +endfunction() + +function(set_on NAME) + set("${NAME}" ON CACHE INTERNAL "") +endfunction() \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 41fe0e6c2..875793b22 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -41,7 +41,8 @@ endif() configure_file(${CMAKE_SOURCE_DIR}/tools/cmake/CTestCustom.cmake ${CMAKE_BINARY_DIR}) -set(BUILD_STATIC_LIB ON) -import_dependency(vendor/cmocka https://cmocka.org/files/1.1/cmocka-1.1.5.tar.xz) -add_subdirectory(vendor/cmocka) +#set(BUILD_STATIC_LIB ON) + +#import_dependency(vendor/cmocka https://cmocka.org/files/1.1/cmocka-1.1.5.tar.xz) +#add_subdirectory(vendor/cmocka) diff --git a/tests/bitset_container_unit.c b/tests/bitset_container_unit.c index b92279893..4e3f1bc97 100644 --- a/tests/bitset_container_unit.c +++ b/tests/bitset_container_unit.c @@ -116,8 +116,8 @@ DEFINE_TEST(and_or_test) { size_t max_value = 60000; - size_t b1_count = 0; - size_t bi_count = 0; + int b1_count = 0; + int bi_count = 0; for (size_t x = 0; x < max_value; x += 3) { bitset_container_set(B1, x); bitset_container_set(BI, x); @@ -132,7 +132,7 @@ DEFINE_TEST(and_or_test) { assert_true(bitset_container_compute_cardinality(B1) == b1_count); assert_true(bitset_container_compute_cardinality(BI) == bi_count); - size_t b2_count = 0; + int b2_count = 0; // important: 62 is not divisible by 3 for (size_t x = 0; x < max_value; x += 62) { bi_count += !bitset_container_get(BI, x); @@ -144,7 +144,7 @@ DEFINE_TEST(and_or_test) { assert_true(bitset_container_compute_cardinality(B2) == b2_count); assert_true(bitset_container_compute_cardinality(BI) == bi_count); - size_t bo_count = 0; + int bo_count = 0; for (size_t x = 0; x < max_value; x += 62 * 3) { bitset_container_set(BO, x); bo_count++; @@ -168,7 +168,7 @@ DEFINE_TEST(and_or_test) { bitset_container_printf(B1); // does it crash? bitset_container_printf(B2); // does it crash? bitset_container_printf(BI); // does it crash? - size_t interc = 0; + int interc = 0; for (size_t x = 0; x < max_value; x ++) { bool in1 = bitset_container_get(B1, x); bool in2 = bitset_container_get(B2, x); diff --git a/tools/cmake/FindCTargets.cmake b/tools/cmake/FindCTargets.cmake index 97f2b64d6..a65821424 100644 --- a/tools/cmake/FindCTargets.cmake +++ b/tools/cmake/FindCTargets.cmake @@ -1,6 +1,10 @@ if (CMAKE_VERSION VERSION_GREATER 3.0.0) cmake_policy(VERSION 3.0.0) endif () +include(${PROJECT_SOURCE_DIR}/tools/cmake/Import.cmake) + +import_dependency(cmocka clibs/cmocka ec387ac76d0ce9eece7cb8f523fca79f0e417ac8) +add_dependency(cmocka) function(add_c_test TEST_NAME) if(ROARING_BUILD_C_TESTS_AS_CPP) # under C++, container_t* != void* @@ -9,8 +13,7 @@ function(add_c_test TEST_NAME) add_executable(${TEST_NAME} ${TEST_NAME}.c) - include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/vendor/cmocka) - target_link_libraries(${TEST_NAME} ${ROARING_LIB_NAME} cmocka-static) + target_link_libraries(${TEST_NAME} ${ROARING_LIB_NAME} cmocka::cmocka) add_test(${TEST_NAME} ${TEST_NAME}) endfunction(add_c_test) @@ -26,8 +29,7 @@ if (CMAKE_VERSION VERSION_GREATER 2.8.10) endif() target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/cpp) - include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/vendor/cmocka) - target_link_libraries(${TEST_NAME} ${ROARING_LIB_NAME} cmocka-static) + target_link_libraries(${TEST_NAME} ${ROARING_LIB_NAME} cmocka::cmocka) add_test(${TEST_NAME} ${TEST_NAME}) endfunction(add_cpp_test) diff --git a/tools/cmake/Import.cmake b/tools/cmake/Import.cmake index a79ed8f58..2b4e3e755 100644 --- a/tools/cmake/Import.cmake +++ b/tools/cmake/Import.cmake @@ -1,10 +1,9 @@ -# Based on github.com/simdjson/simdjson/blob/master/dependencies/import.cmocka by @friendlyanon +set(dep_root "${PROJECT_SOURCE_DIR}/dependencies/.cache") -set(dep_root "${CMAKE_CURRENT_SOURCE_DIR}/.cache") -function(import_dependency NAME URL) - message(STATUS "Importing ${NAME} (${URL})") - set(target "${CMAKE_CURRENT_SOURCE_DIR}/${NAME}") +function(import_dependency NAME GITHUB_REPO COMMIT) + message(STATUS "Importing ${NAME} (${GITHUB_REPO}@${COMMIT})") + set(target "${dep_root}/${NAME}") # If the folder exists in the cache, then we assume that everything is as # should be and do nothing @@ -13,12 +12,12 @@ function(import_dependency NAME URL) return() endif() - set(archive "${dep_root}/archive.tar.xz") + set(zip_url "https://github.com/${GITHUB_REPO}/archive/${COMMIT}.zip") + set(archive "${dep_root}/archive.zip") set(dest "${dep_root}/_extract") - file(DOWNLOAD "${URL}" "${archive}") + file(DOWNLOAD "${zip_url}" "${archive}") file(MAKE_DIRECTORY "${dest}") - file(GLOB dir LIST_DIRECTORIES YES "${dep_root}/*") execute_process( WORKING_DIRECTORY "${dest}" COMMAND "${CMAKE_COMMAND}" -E tar xf "${archive}") @@ -32,3 +31,20 @@ function(import_dependency NAME URL) set("${NAME}_SOURCE_DIR" "${target}" PARENT_SCOPE) endfunction() + +# Delegates to the dependency +macro(add_dependency NAME) + if(NOT DEFINED "${NAME}_SOURCE_DIR") + message(FATAL_ERROR "Missing ${NAME}_SOURCE_DIR variable") + endif() + + add_subdirectory("${${NAME}_SOURCE_DIR}" "${PROJECT_BINARY_DIR}/_deps/${NAME}" EXCLUDE_FROM_ALL) +endmacro() + +function(set_off NAME) + set("${NAME}" OFF CACHE INTERNAL "") +endfunction() + +function(set_on NAME) + set("${NAME}" ON CACHE INTERNAL "") +endfunction() \ No newline at end of file From 5762327ce84ba574080c585f4e86f75a16e18660 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 15:04:07 -0400 Subject: [PATCH 02/10] Adding dir --- microbenchmarks/CMakeLists.txt | 16 + microbenchmarks/bench.cpp | 156 +++ microbenchmarks/bench.h | 235 ++++ .../performancecounters/apple_arm_events.h | 1011 +++++++++++++++++ .../performancecounters/event_counter.h | 150 +++ microbenchmarks/performancecounters/ibireme.h | 917 +++++++++++++++ .../performancecounters/linux-perf-events.h | 101 ++ 7 files changed, 2586 insertions(+) create mode 100644 microbenchmarks/CMakeLists.txt create mode 100644 microbenchmarks/bench.cpp create mode 100644 microbenchmarks/bench.h create mode 100644 microbenchmarks/performancecounters/apple_arm_events.h create mode 100644 microbenchmarks/performancecounters/event_counter.h create mode 100644 microbenchmarks/performancecounters/ibireme.h create mode 100644 microbenchmarks/performancecounters/linux-perf-events.h diff --git a/microbenchmarks/CMakeLists.txt b/microbenchmarks/CMakeLists.txt new file mode 100644 index 000000000..ca8862df5 --- /dev/null +++ b/microbenchmarks/CMakeLists.txt @@ -0,0 +1,16 @@ + +set (BENCHMARK_DATA_DIR "${PROJECT_SOURCE_DIR}/benchmarks/realdata/") + +include(${PROJECT_SOURCE_DIR}/tools/cmake/Import.cmake) + +set_off(BENCHMARK_ENABLE_TESTING) +set_off(BENCHMARK_ENABLE_INSTALL) +set_off(BENCHMARK_ENABLE_WERROR) + +import_dependency(google_benchmarks google/benchmark f91b6b4) +add_dependency(google_benchmarks) + +add_executable(bench bench.cpp) +target_link_libraries(bench PRIVATE roaring) +target_link_libraries(bench PRIVATE benchmark::benchmark) +target_compile_definitions(bench PRIVATE BENCHMARK_DATA_DIR="${BENCHMARK_DATA_DIR}") diff --git a/microbenchmarks/bench.cpp b/microbenchmarks/bench.cpp new file mode 100644 index 000000000..2ea016d2d --- /dev/null +++ b/microbenchmarks/bench.cpp @@ -0,0 +1,156 @@ +#include "bench.h" + +struct compute_cardinality { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i < count; ++i) { + marker += roaring_bitmap_get_cardinality(bitmaps[i]); + } + return marker; + } +}; + +auto ComputeCardinality = BasicBench; +BENCHMARK(ComputeCardinality); + +struct successive_intersection { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i + 1 < count; ++i) { + roaring_bitmap_t *tempand = + roaring_bitmap_and(bitmaps[i], bitmaps[i + 1]); + marker += roaring_bitmap_get_cardinality(tempand); + roaring_bitmap_free(tempand); + } + return marker; + } +}; +auto SuccessiveIntersection = BasicBench; +BENCHMARK(SuccessiveIntersection); + +struct successive_union { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i + 1 < count; ++i) { + roaring_bitmap_t *tempand = + roaring_bitmap_or(bitmaps[i], bitmaps[i + 1]); + marker += roaring_bitmap_get_cardinality(tempand); + roaring_bitmap_free(tempand); + } + return marker; + } +}; +auto SuccessiveUnion = BasicBench; +BENCHMARK(SuccessiveUnion); + +struct many_union { + static uint64_t run() { + uint64_t marker = 0; + roaring_bitmap_t *totalorbitmap = + roaring_bitmap_or_many(count, (const roaring_bitmap_t **)bitmaps); + marker = roaring_bitmap_get_cardinality(totalorbitmap); + roaring_bitmap_free(totalorbitmap); + return marker; + } +}; +auto TotalUnion = BasicBench; +BENCHMARK(TotalUnion); + +struct many_union_heap { + static uint64_t run() { + uint64_t marker = 0; + roaring_bitmap_t *totalorbitmap = roaring_bitmap_or_many_heap( + count, (const roaring_bitmap_t **)bitmaps); + marker = roaring_bitmap_get_cardinality(totalorbitmap); + roaring_bitmap_free(totalorbitmap); + return marker; + } +}; +auto TotalUnionHeap = BasicBench; +BENCHMARK(TotalUnionHeap); + +struct random_access { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i < count; ++i) { + marker += roaring_bitmap_contains(bitmaps[i], maxvalue / 4); + marker += roaring_bitmap_contains(bitmaps[i], maxvalue / 2); + marker += roaring_bitmap_contains(bitmaps[i], 3 * maxvalue / 4); + } + return marker; + } +}; +auto RandomAccess = BasicBench; +BENCHMARK(RandomAccess); + +struct to_array { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i < count; ++i) { + roaring_bitmap_to_uint32_array(bitmaps[i], array_buffer); + marker += array_buffer[0]; + } + return marker; + } +}; +auto ToArray = BasicBench; +BENCHMARK(ToArray); + +struct iterate_all { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i < count; ++i) { + roaring_bitmap_t *r = bitmaps[i]; + roaring_uint32_iterator_t j; + roaring_init_iterator(r, &j); + while (j.has_value) { + marker++; + roaring_advance_uint32_iterator(&j); + } + } + return marker; + } +}; +auto IterateAll = BasicBench; +BENCHMARK(IterateAll); + +int main(int argc, char **argv) { + const char *dir_name; + if ((argc == 1) || (argc > 1 && argv[1][0] == '-')) { + benchmark::AddCustomContext( + "benchmarking other files", + "You may pass is a data directory as a parameter."); + dir_name = BENCHMARK_DATA_DIR "census1881"; + } else { + dir_name = argv[1]; + } + int number_loaded = load(dir_name); +#if (__APPLE__ && __aarch64__) || defined(__linux__) + if (!collector.has_events()) { + benchmark::AddCustomContext("performance counters", + "No privileged access (sudo may help)."); + } +#else + if (!collector.has_events()) { + benchmark::AddCustomContext("performance counters", + "Unsupported system."); + } +#endif + benchmark::AddCustomContext("data source", dir_name); + + benchmark::AddCustomContext("number of bitmaps", std::to_string(count)); + + benchmark::AddCustomContext( + "In RAM volume in MiB (estimated)", + std::to_string(bitmap_examples_bytes / (1024 * 1024.0))); + if (number_loaded == -1) { + return EXIT_FAILURE; + } + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + for (size_t i = 0; i < count; ++i) { + roaring_bitmap_free(bitmaps[i]); + } + free(array_buffer); +} \ No newline at end of file diff --git a/microbenchmarks/bench.h b/microbenchmarks/bench.h new file mode 100644 index 000000000..755df5811 --- /dev/null +++ b/microbenchmarks/bench.h @@ -0,0 +1,235 @@ +#ifndef CROARING_MICROBENCHMARKS_BENCH_H +#define CROARING_MICROBENCHMARKS_BENCH_H +// clang-format off +#include +#include +#include +#include +#include +#include + + +#include +#include + +#include "performancecounters/event_counter.h" +// clang-format on + +event_collector collector; +size_t N = 1000; +size_t bitmap_examples_bytes = 0; +size_t count = 0; +roaring_bitmap_t **bitmaps = NULL; +uint32_t * array_buffer; +uint32_t maxvalue = 0; +uint32_t maxcard = 0; + +/** + * Read the content of a file to a char array. Caller is + * responsible for memory de-allocation. + * Returns NULL on error. + * + * (If the individual files are small, this function is + * a good idea.) + */ +static char *read_file(const char *filename) { + FILE *fp = fopen(filename, "r"); + if (!fp) { + printf("Could not open file %s\n", filename); + return NULL; + } + + fseek(fp, 0, SEEK_END); + size_t size = (size_t)ftell(fp); + rewind(fp); + char *answer = (char *)malloc(size + 1); + if (!answer) { + fclose(fp); + return NULL; + } + if (fread(answer, size, 1, fp) != 1) { + free(answer); + return NULL; + } + answer[size] = '\0'; + fclose(fp); + return answer; +} + +/** + * Given a file made of comma-separated integers, + * read it all and generate an array of integers. + * The caller is responsible for memory de-allocation. + */ +static uint32_t *read_integer_file(const char *filename, size_t *howmany) { + char *buffer = read_file(filename); + if (buffer == NULL) return NULL; + + size_t howmanyints = 1; + size_t i1 = 0; + for (; buffer[i1] != '\0'; i1++) { + if (buffer[i1] == ',') ++howmanyints; + } + + uint32_t *answer = (uint32_t *)malloc(howmanyints * sizeof(uint32_t)); + if (answer == NULL) return NULL; + size_t pos = 0; + for (size_t i = 0; (i < i1) && (buffer[i] != '\0'); i++) { + uint32_t currentint; + while ((buffer[i] < '0') || (buffer[i] > '9')) { + i++; + if (buffer[i] == '\0') goto END; + } + currentint = (uint32_t)(buffer[i] - '0'); + i++; + for (; (buffer[i] >= '0') && (buffer[i] <= '9'); i++) + currentint = currentint * 10 + (uint32_t)(buffer[i] - '0'); + answer[pos++] = currentint; + } +END: + if (pos != howmanyints) { + printf("unexpected number of integers! %d %d \n", (int)pos, + (int)howmanyints); + } + *howmany = pos; + free(buffer); + return answer; +} + +/** + * Does the file filename ends with the given extension. + */ +static bool has_extension(const char *filename, const char *extension) { + const char *ext = strrchr(filename, '.'); + return (ext && !strcmp(ext, extension)); +} + +/** + * read all (count) integer files in a directory. Caller is responsible + * for memory de-allocation. In case of error, a NULL is returned. + */ +static uint32_t **read_all_integer_files(const char *dirname, + const char *extension, + size_t **howmany, size_t *tcount) { + struct dirent **entry_list; + + int c = scandir(dirname, &entry_list, 0, alphasort); + if (c < 0) return NULL; + size_t truec = 0; + for (int i = 0; i < c; i++) { + if (has_extension(entry_list[i]->d_name, extension)) ++truec; + } + *tcount = truec; + *howmany = (size_t *)malloc(sizeof(size_t) * (*tcount)); + uint32_t **answer = (uint32_t **)malloc(sizeof(uint32_t *) * (*tcount)); + size_t dirlen = strlen(dirname); + char *modifdirname = (char *)dirname; + if (modifdirname[dirlen - 1] != '/') { + modifdirname = (char *)malloc(dirlen + 2); + strcpy(modifdirname, dirname); + modifdirname[dirlen] = '/'; + modifdirname[dirlen + 1] = '\0'; + dirlen++; + } + for (size_t i = 0, pos = 0; i < (size_t)c; + i++) { /* formerly looped while i < *tcount */ + if (!has_extension(entry_list[i]->d_name, extension)) continue; + size_t filelen = strlen(entry_list[i]->d_name); + char *fullpath = (char *)malloc(dirlen + filelen + 1); + strcpy(fullpath, modifdirname); + strcpy(fullpath + dirlen, entry_list[i]->d_name); + answer[pos] = read_integer_file(fullpath, &((*howmany)[pos])); + pos++; + free(fullpath); + } + if (modifdirname != dirname) { + free(modifdirname); + } + for (int i = 0; i < c; ++i) free(entry_list[i]); + free(entry_list); + return answer; +} +/** + * Once you have collected all the integers, build the bitmaps. + */ +static roaring_bitmap_t **create_all_bitmaps(size_t *howmany, + uint32_t **numbers, size_t tcount, + bool runoptimize, + bool copy_on_write) { + for (size_t i = 0; i < count; i++) { + if (howmany[i] > 0) { + if (maxvalue < numbers[i][howmany[i] - 1]) { + maxvalue = numbers[i][howmany[i] - 1]; + } + } + if(maxcard < howmany[i]) { maxcard = howmany[i]; } + } + if (numbers == NULL) return NULL; + roaring_bitmap_t **answer = + (roaring_bitmap_t **)malloc(sizeof(roaring_bitmap_t *) * tcount); + bitmap_examples_bytes = 0; + for (size_t i = 0; i < tcount; i++) { + answer[i] = roaring_bitmap_of_ptr(howmany[i], numbers[i]); + if (runoptimize) roaring_bitmap_run_optimize(answer[i]); + roaring_bitmap_shrink_to_fit(answer[i]); + bitmap_examples_bytes += roaring_bitmap_size_in_bytes(answer[i]); + roaring_bitmap_set_copy_on_write(answer[i], copy_on_write); + } + array_buffer = (uint32_t*) malloc(maxcard * sizeof(uint32_t)); + return answer; +} + +template +static void BasicBench(benchmark::State &state) { + // volatile to prevent optimizations. + volatile uint64_t marker = 0; + for (auto _ : state) { + marker = func::run(); + } + if (collector.has_events()) { + event_aggregate aggregate{}; + for (size_t i = 0; i < N; i++) { + std::atomic_thread_fence(std::memory_order_acquire); + collector.start(); + marker = func::run(); + std::atomic_thread_fence(std::memory_order_release); + event_count allocate_count = collector.end(); + aggregate << allocate_count; + } + state.counters["cycles"] = aggregate.best.cycles(); + + state.counters["instructions"] = aggregate.best.instructions(); + state.counters["GHz"] = + aggregate.best.cycles() / aggregate.best.elapsed_ns(); + } + (void)marker; +} + + +int load(const char *dirname) { + const char *extension = ".txt"; + bool copy_on_write = false; + bool runoptimize = true; + size_t *howmany; + + uint32_t **numbers = + read_all_integer_files(dirname, extension, &howmany, &count); + if (numbers == NULL) { + printf( + "I could not find or load any data file with extension %s in " + "directory %s.\n", + extension, dirname); + return -1; + } + bitmaps = + create_all_bitmaps(howmany, numbers, count, runoptimize, copy_on_write); + + for (size_t i = 0; i < count; ++i) { + free(numbers[i]); + } + free(howmany); + if (bitmaps == NULL) return -1; + return count; +} + +#endif \ No newline at end of file diff --git a/microbenchmarks/performancecounters/apple_arm_events.h b/microbenchmarks/performancecounters/apple_arm_events.h new file mode 100644 index 000000000..5ce147ee2 --- /dev/null +++ b/microbenchmarks/performancecounters/apple_arm_events.h @@ -0,0 +1,1011 @@ + +// Original design from: +// ============================================================================= +// XNU kperf/kpc +// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges +// +// References: +// +// XNU source (since xnu 2422.1.72): +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h +// https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c +// +// Lightweight PET (Profile Every Thread, since xnu 3789.1.32): +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c +// +// System Private frameworks (since macOS 10.11, iOS 8.0): +// /System/Library/PrivateFrameworks/kperf.framework +// /System/Library/PrivateFrameworks/kperfdata.framework +// +// Xcode framework (since Xcode 7.0): +// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework +// +// CPU database (plist files) +// macOS (since macOS 10.11): +// /usr/share/kpep/.plist +// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0): +// /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform +// /DeviceSupport//DeveloperDiskImage.dmg/usr/share/kpep/.plist +// +// +// Created by YaoYuan on 2021. +// Released into the public domain (unlicense.org). +// ============================================================================= + +#ifndef M1CYCLES_H +#define M1CYCLES_H + +#include +#include +#include +#include +#include + +#include // for dlopen() and dlsym() +#include // for mach_absolute_time() +#include // for kdebug trace decode +#include // for sysctl() +#include // for usleep() + +struct performance_counters { + double cycles; + double branches; + double missed_branches; + double instructions; + performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i) + : cycles(c), branches(b), missed_branches(m), instructions(i) {} + performance_counters(double c, double b, double m, double i) + : cycles(c), branches(b), missed_branches(m), instructions(i) {} + performance_counters(double init) + : cycles(init), + branches(init), + missed_branches(init), + instructions(init) {} + + inline performance_counters &operator-=(const performance_counters &other) { + cycles -= other.cycles; + branches -= other.branches; + missed_branches -= other.missed_branches; + instructions -= other.instructions; + return *this; + } + inline performance_counters &min(const performance_counters &other) { + cycles = other.cycles < cycles ? other.cycles : cycles; + branches = other.branches < branches ? other.branches : branches; + missed_branches = other.missed_branches < missed_branches + ? other.missed_branches + : missed_branches; + instructions = + other.instructions < instructions ? other.instructions : instructions; + return *this; + } + inline performance_counters &operator+=(const performance_counters &other) { + cycles += other.cycles; + branches += other.branches; + missed_branches += other.missed_branches; + instructions += other.instructions; + return *this; + } + + inline performance_counters &operator/=(double numerator) { + cycles /= numerator; + branches /= numerator; + missed_branches /= numerator; + instructions /= numerator; + return *this; + } +}; + +inline performance_counters operator-(const performance_counters &a, + const performance_counters &b) { + return performance_counters(a.cycles - b.cycles, a.branches - b.branches, + a.missed_branches - b.missed_branches, + a.instructions - b.instructions); +} + +typedef float f32; +typedef double f64; +typedef int8_t i8; +typedef uint8_t u8; +typedef int16_t i16; +typedef uint16_t u16; +typedef int32_t i32; +typedef uint32_t u32; +typedef int64_t i64; +typedef uint64_t u64; +typedef size_t usize; + +// ----------------------------------------------------------------------------- +// header (reverse engineered) +// This framework wraps some sysctl calls to communicate with the kpc in kernel. +// Most functions requires root privileges, or process is "blessed". +// ----------------------------------------------------------------------------- + +// Cross-platform class constants. +#define KPC_CLASS_FIXED (0) +#define KPC_CLASS_CONFIGURABLE (1) +#define KPC_CLASS_POWER (2) +#define KPC_CLASS_RAWPMU (3) + +// Cross-platform class mask constants. +#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) // 1 +#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2 +#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) // 4 +#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) // 8 + +// PMU version constants. +#define KPC_PMU_ERROR (0) // Error +#define KPC_PMU_INTEL_V3 (1) // Intel +#define KPC_PMU_ARM_APPLE (2) // ARM64 +#define KPC_PMU_INTEL_V2 (3) // Old Intel +#define KPC_PMU_ARM_V2 (4) // Old ARM + +// The maximum number of counters we could read from every class in one go. +// ARMV7: FIXED: 1, CONFIGURABLE: 4 +// ARM32: FIXED: 2, CONFIGURABLE: 6 +// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8) +// x86: 32 +#define KPC_MAX_COUNTERS 32 + +// Bits for defining what to do on an action. +// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h +#define KPERF_SAMPLER_TH_INFO (1U << 0) +#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1) +#define KPERF_SAMPLER_KSTACK (1U << 2) +#define KPERF_SAMPLER_USTACK (1U << 3) +#define KPERF_SAMPLER_PMC_THREAD (1U << 4) +#define KPERF_SAMPLER_PMC_CPU (1U << 5) +#define KPERF_SAMPLER_PMC_CONFIG (1U << 6) +#define KPERF_SAMPLER_MEMINFO (1U << 7) +#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8) +#define KPERF_SAMPLER_TH_DISPATCH (1U << 9) +#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10) +#define KPERF_SAMPLER_SYS_MEM (1U << 11) +#define KPERF_SAMPLER_TH_INSCYC (1U << 12) +#define KPERF_SAMPLER_TK_INFO (1U << 13) + +// Maximum number of kperf action ids. +#define KPERF_ACTION_MAX (32) + +// Maximum number of kperf timer ids. +#define KPERF_TIMER_MAX (8) + +// x86/arm config registers are 64-bit +typedef u64 kpc_config_t; + +/// Print current CPU identification string to the buffer (same as snprintf), +/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC +/// database in /usr/share/kpep. +/// @return string's length, or negative value if error occurs. +/// @note This method does not requires root privileges. +/// @details sysctl get(hw.cputype), get(hw.cpusubtype), +/// get(hw.cpufamily), get(machdep.cpu.model) +static int (*kpc_cpu_string)(char *buf, usize buf_size); + +/// Get the version of KPC that's being run. +/// @return See `PMU version constants` above. +/// @details sysctl get(kpc.pmu_version) +static u32 (*kpc_pmu_version)(void); + +/// Get running PMC classes. +/// @return See `class mask constants` above, +/// 0 if error occurs or no class is set. +/// @details sysctl get(kpc.counting) +static u32 (*kpc_get_counting)(void); + +/// Set PMC classes to enable counting. +/// @param classes See `class mask constants` above, set 0 to shutdown counting. +/// @return 0 for success. +/// @details sysctl set(kpc.counting) +static int (*kpc_set_counting)(u32 classes); + +/// Get running PMC classes for current thread. +/// @return See `class mask constants` above, +/// 0 if error occurs or no class is set. +/// @details sysctl get(kpc.thread_counting) +static u32 (*kpc_get_thread_counting)(void); + +/// Set PMC classes to enable counting for current thread. +/// @param classes See `class mask constants` above, set 0 to shutdown counting. +/// @return 0 for success. +/// @details sysctl set(kpc.thread_counting) +static int (*kpc_set_thread_counting)(u32 classes); + +/// Get how many config registers there are for a given mask. +/// For example: Intel may returns 1 for `KPC_CLASS_FIXED_MASK`, +/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`. +/// @param classes See `class mask constants` above. +/// @return 0 if error occurs or no class is set. +/// @note This method does not requires root privileges. +/// @details sysctl get(kpc.config_count) +static u32 (*kpc_get_config_count)(u32 classes); + +/// Get config registers. +/// @param classes see `class mask constants` above. +/// @param config Config buffer to receive values, should not smaller than +/// kpc_get_config_count(classes) * sizeof(kpc_config_t). +/// @return 0 for success. +/// @details sysctl get(kpc.config_count), get(kpc.config) +static int (*kpc_get_config)(u32 classes, kpc_config_t *config); + +/// Set config registers. +/// @param classes see `class mask constants` above. +/// @param config Config buffer, should not smaller than +/// kpc_get_config_count(classes) * sizeof(kpc_config_t). +/// @return 0 for success. +/// @details sysctl get(kpc.config_count), set(kpc.config) +static int (*kpc_set_config)(u32 classes, kpc_config_t *config); + +/// Get how many counters there are for a given mask. +/// For example: Intel may returns 3 for `KPC_CLASS_FIXED_MASK`, +/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`. +/// @param classes See `class mask constants` above. +/// @note This method does not requires root privileges. +/// @details sysctl get(kpc.counter_count) +static u32 (*kpc_get_counter_count)(u32 classes); + +/// Get counter accumulations. +/// If `all_cpus` is true, the buffer count should not smaller than +/// (cpu_count * counter_count). Otherwize, the buffer count should not smaller +/// than (counter_count). +/// @see kpc_get_counter_count(), kpc_cpu_count(). +/// @param all_cpus true for all CPUs, false for current cpu. +/// @param classes See `class mask constants` above. +/// @param curcpu A pointer to receive current cpu id, can be NULL. +/// @param buf Buffer to receive counter's value. +/// @return 0 for success. +/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters) +static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu, + u64 *buf); + +/// Get counter accumulations for current thread. +/// @param tid Thread id, should be 0. +/// @param buf_count The number of buf's elements (not bytes), +/// should not smaller than kpc_get_counter_count(). +/// @param buf Buffer to receive counter's value. +/// @return 0 for success. +/// @details sysctl get(kpc.thread_counters) +static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf); + +/// Acquire/release the counters used by the Power Manager. +/// @param val 1:acquire, 0:release +/// @return 0 for success. +/// @details sysctl set(kpc.force_all_ctrs) +static int (*kpc_force_all_ctrs_set)(int val); + +/// Get the state of all_ctrs. +/// @return 0 for success. +/// @details sysctl get(kpc.force_all_ctrs) +static int (*kpc_force_all_ctrs_get)(int *val_out); + +/// Set number of actions, should be `KPERF_ACTION_MAX`. +/// @details sysctl set(kperf.action.count) +static int (*kperf_action_count_set)(u32 count); + +/// Get number of actions. +/// @details sysctl get(kperf.action.count) +static int (*kperf_action_count_get)(u32 *count); + +/// Set what to sample when a trigger fires an action, e.g. +/// `KPERF_SAMPLER_PMC_CPU`. +/// @details sysctl set(kperf.action.samplers) +static int (*kperf_action_samplers_set)(u32 actionid, u32 sample); + +/// Get what to sample when a trigger fires an action. +/// @details sysctl get(kperf.action.samplers) +static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample); + +/// Apply a task filter to the action, -1 to disable filter. +/// @details sysctl set(kperf.action.filter_by_task) +static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port); + +/// Apply a pid filter to the action, -1 to disable filter. +/// @details sysctl set(kperf.action.filter_by_pid) +static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid); + +/// Set number of time triggers, should be `KPERF_TIMER_MAX`. +/// @details sysctl set(kperf.timer.count) +static int (*kperf_timer_count_set)(u32 count); + +/// Get number of time triggers. +/// @details sysctl get(kperf.timer.count) +static int (*kperf_timer_count_get)(u32 *count); + +/// Set timer number and period. +/// @details sysctl set(kperf.timer.period) +static int (*kperf_timer_period_set)(u32 actionid, u64 tick); + +/// Get timer number and period. +/// @details sysctl get(kperf.timer.period) +static int (*kperf_timer_period_get)(u32 actionid, u64 *tick); + +/// Set timer number and actionid. +/// @details sysctl set(kperf.timer.action) +static int (*kperf_timer_action_set)(u32 actionid, u32 timerid); + +/// Get timer number and actionid. +/// @details sysctl get(kperf.timer.action) +static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid); + +/// Set which timer ID does PET (Profile Every Thread). +/// @details sysctl set(kperf.timer.pet_timer) +static int (*kperf_timer_pet_set)(u32 timerid); + +/// Get which timer ID does PET (Profile Every Thread). +/// @details sysctl get(kperf.timer.pet_timer) +static int (*kperf_timer_pet_get)(u32 *timerid); + +/// Enable or disable sampling. +/// @details sysctl set(kperf.sampling) +static int (*kperf_sample_set)(u32 enabled); + +/// Get is currently sampling. +/// @details sysctl get(kperf.sampling) +static int (*kperf_sample_get)(u32 *enabled); + +/// Reset kperf: stop sampling, kdebug, timers and actions. +/// @return 0 for success. +static int (*kperf_reset)(void); + +/// Nanoseconds to CPU ticks. +static u64 (*kperf_ns_to_ticks)(u64 ns); + +/// CPU ticks to nanoseconds. +static u64 (*kperf_ticks_to_ns)(u64 ticks); + +/// CPU ticks frequency (mach_absolute_time). +static u64 (*kperf_tick_frequency)(void); + +// ----------------------------------------------------------------------------- +// header (reverse engineered) +// This framework provides some functions to access the local CPU database. +// These functions do not require root privileges. +// ----------------------------------------------------------------------------- + +// KPEP CPU archtecture constants. +#define KPEP_ARCH_I386 0 +#define KPEP_ARCH_X86_64 1 +#define KPEP_ARCH_ARM 2 +#define KPEP_ARCH_ARM64 3 + +/// KPEP event (size: 48/28 bytes on 64/32 bit OS) +typedef struct kpep_event { + const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY". + const char *description; ///< Description for this event. + const char *errata; ///< Errata, currently NULL. + const char *alias; ///< Alias name, such as "Instructions", "Cycles". + const char *fallback; ///< Fallback event name for fixed counter. + u32 mask; + u8 number; + u8 umask; + u8 reserved; + u8 is_fixed; +} kpep_event; + +/// KPEP database (size: 144/80 bytes on 64/32 bit OS) +typedef struct kpep_db { + const char *name; ///< Database name, such as "haswell". + const char *cpu_id; ///< Plist name, such as "cpu_7_8_10b282dc". + const char *marketing_name; ///< Marketing name, such as "Intel Haswell". + void *plist_data; ///< Plist data (CFDataRef), currently NULL. + void *event_map; ///< All events (CFDict). + kpep_event + *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count). + kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *) + ///< * fixed_counter_count) + void *alias_map; ///< All aliases (CFDict). + usize reserved_1; + usize reserved_2; + usize reserved_3; + usize event_count; ///< All events count. + usize alias_count; + usize fixed_counter_count; + usize config_counter_count; + usize power_counter_count; + u32 archtecture; ///< see `KPEP CPU archtecture constants` above. + u32 fixed_counter_bits; + u32 config_counter_bits; + u32 power_counter_bits; +} kpep_db; + +/// KPEP config (size: 80/44 bytes on 64/32 bit OS) +typedef struct kpep_config { + kpep_db *db; + kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL + usize *ev_map; ///< (sizeof(usize *) * counter_count), init 0 + usize *ev_idx; ///< (sizeof(usize *) * counter_count), init -1 + u32 *flags; ///< (sizeof(u32 *) * counter_count), init 0 + u64 *kpc_periods; ///< (sizeof(u64 *) * counter_count), init 0 + usize event_count; /// kpep_config_events_count() + usize counter_count; + u32 classes; ///< See `class mask constants` above. + u32 config_counter; + u32 power_counter; + u32 reserved; +} kpep_config; + +/// Error code for kpep_config_xxx() and kpep_db_xxx() functions. +typedef enum { + KPEP_CONFIG_ERROR_NONE = 0, + KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1, + KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2, + KPEP_CONFIG_ERROR_IO = 3, + KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4, + KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5, + KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6, + KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7, + KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8, + KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9, + KPEP_CONFIG_ERROR_DB_CORRUPT = 10, + KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11, + KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12, + KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13, + KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14, + KPEP_CONFIG_ERROR_ERRNO = 15, + KPEP_CONFIG_ERROR_MAX +} kpep_config_error_code; + +/// Error description for kpep_config_error_code. +static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = { + "none", + "invalid argument", + "out of memory", + "I/O", + "buffer too small", + "current system unknown", + "database path invalid", + "database not found", + "database architecture unsupported", + "database version unsupported", + "database corrupt", + "event not found", + "conflicting events", + "all counters must be forced", + "event unavailable", + "check errno"}; + +/// Error description. +static const char *kpep_config_error_desc(int code) { + if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) { + return kpep_config_error_names[code]; + } + return "unknown error"; +} + +/// Create a config. +/// @param db A kpep db, see kpep_db_create() +/// @param cfg_ptr A pointer to receive the new config. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr); + +/// Free the config. +static void (*kpep_config_free)(kpep_config *cfg); + +/// Add an event to config. +/// @param cfg The config. +/// @param ev_ptr A event pointer. +/// @param flag 0: all, 1: user space only +/// @param err Error bitmap pointer, can be NULL. +/// If return value is `CONFLICTING_EVENTS`, this bitmap contains +/// the conflicted event indices, e.g. "1 << 2" means index 2. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr, + u32 flag, u32 *err); + +/// Remove event at index. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx); + +/// Force all counters. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_force_counters)(kpep_config *cfg); + +/// Get events count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr); + +/// Get all event pointers. +/// @param buf A buffer to receive event pointers. +/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_events_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf, + usize buf_size); + +/// Get kpc register configs. +/// @param buf A buffer to receive kpc register configs. +/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_kpc_count() * sizeof(kpc_config_t). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf, + usize buf_size); + +/// Get kpc register config count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr); + +/// Get kpc classes. +/// @param classes See `class mask constants` above. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr); + +/// Get the index mapping from event to counter. +/// @param buf A buffer to receive indexes. +/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_events_count() * sizeof(kpc_config_t). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size); + +/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/". +/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8". +/// Pass NULL for current CPU. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_create)(const char *name, kpep_db **db_ptr); + +/// Free the kpep database. +static void (*kpep_db_free)(kpep_db *db); + +/// Get the database's name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_name)(kpep_db *db, const char **name); + +/// Get the event alias count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_aliases_count)(kpep_db *db, usize *count); + +/// Get all alias. +/// @param buf A buffer to receive all alias strings. +/// @param buf_size The buffer's size in bytes, +/// should not smaller than kpep_db_aliases_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size); + +/// Get counters count for given classes. +/// @param classes 1: Fixed, 2: Configurable. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count); + +/// Get all event count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_events_count)(kpep_db *db, usize *count); + +/// Get all events. +/// @param buf A buffer to receive all event pointers. +/// @param buf_size The buffer's size in bytes, +/// should not smaller than kpep_db_events_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size); + +/// Get one event by name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr); + +/// Get event's name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr); + +/// Get event's alias. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr); + +/// Get event's description. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr); + +// ----------------------------------------------------------------------------- +// load kperf/kperfdata dynamic library +// ----------------------------------------------------------------------------- + +typedef struct { + const char *name; + void **impl; +} lib_symbol; + +#define lib_nelems(x) (sizeof(x) / sizeof((x)[0])) +#define lib_symbol_def(name) \ + { \ +#name, (void **)&name \ + } + +static const lib_symbol lib_symbols_kperf[] = { + lib_symbol_def(kpc_pmu_version), + lib_symbol_def(kpc_cpu_string), + lib_symbol_def(kpc_set_counting), + lib_symbol_def(kpc_get_counting), + lib_symbol_def(kpc_set_thread_counting), + lib_symbol_def(kpc_get_thread_counting), + lib_symbol_def(kpc_get_config_count), + lib_symbol_def(kpc_get_counter_count), + lib_symbol_def(kpc_set_config), + lib_symbol_def(kpc_get_config), + lib_symbol_def(kpc_get_cpu_counters), + lib_symbol_def(kpc_get_thread_counters), + lib_symbol_def(kpc_force_all_ctrs_set), + lib_symbol_def(kpc_force_all_ctrs_get), + lib_symbol_def(kperf_action_count_set), + lib_symbol_def(kperf_action_count_get), + lib_symbol_def(kperf_action_samplers_set), + lib_symbol_def(kperf_action_samplers_get), + lib_symbol_def(kperf_action_filter_set_by_task), + lib_symbol_def(kperf_action_filter_set_by_pid), + lib_symbol_def(kperf_timer_count_set), + lib_symbol_def(kperf_timer_count_get), + lib_symbol_def(kperf_timer_period_set), + lib_symbol_def(kperf_timer_period_get), + lib_symbol_def(kperf_timer_action_set), + lib_symbol_def(kperf_timer_action_get), + lib_symbol_def(kperf_sample_set), + lib_symbol_def(kperf_sample_get), + lib_symbol_def(kperf_reset), + lib_symbol_def(kperf_timer_pet_set), + lib_symbol_def(kperf_timer_pet_get), + lib_symbol_def(kperf_ns_to_ticks), + lib_symbol_def(kperf_ticks_to_ns), + lib_symbol_def(kperf_tick_frequency), +}; + +static const lib_symbol lib_symbols_kperfdata[] = { + lib_symbol_def(kpep_config_create), + lib_symbol_def(kpep_config_free), + lib_symbol_def(kpep_config_add_event), + lib_symbol_def(kpep_config_remove_event), + lib_symbol_def(kpep_config_force_counters), + lib_symbol_def(kpep_config_events_count), + lib_symbol_def(kpep_config_events), + lib_symbol_def(kpep_config_kpc), + lib_symbol_def(kpep_config_kpc_count), + lib_symbol_def(kpep_config_kpc_classes), + lib_symbol_def(kpep_config_kpc_map), + lib_symbol_def(kpep_db_create), + lib_symbol_def(kpep_db_free), + lib_symbol_def(kpep_db_name), + lib_symbol_def(kpep_db_aliases_count), + lib_symbol_def(kpep_db_aliases), + lib_symbol_def(kpep_db_counters_count), + lib_symbol_def(kpep_db_events_count), + lib_symbol_def(kpep_db_events), + lib_symbol_def(kpep_db_event), + lib_symbol_def(kpep_event_name), + lib_symbol_def(kpep_event_alias), + lib_symbol_def(kpep_event_description), +}; + +#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf" +#define lib_path_kperfdata \ + "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata" + +static bool lib_inited = false; +static bool lib_has_err = false; +static char lib_err_msg[256]; + +static void *lib_handle_kperf = NULL; +static void *lib_handle_kperfdata = NULL; + +static void lib_deinit(void) { + lib_inited = false; + lib_has_err = false; + if (lib_handle_kperf) dlclose(lib_handle_kperf); + if (lib_handle_kperfdata) dlclose(lib_handle_kperfdata); + lib_handle_kperf = NULL; + lib_handle_kperfdata = NULL; + for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { + const lib_symbol *symbol = &lib_symbols_kperf[i]; + *symbol->impl = NULL; + } + for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { + const lib_symbol *symbol = &lib_symbols_kperfdata[i]; + *symbol->impl = NULL; + } +} + +static bool lib_init(void) { +#define return_err() \ + do { \ + lib_deinit(); \ + lib_inited = true; \ + lib_has_err = true; \ + return false; \ + } while (false) + + if (lib_inited) return !lib_has_err; + + // load dynamic library + lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY); + if (!lib_handle_kperf) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperf.framework, message: %s.", dlerror()); + return_err(); + } + lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY); + if (!lib_handle_kperfdata) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperfdata.framework, message: %s.", dlerror()); + return_err(); + } + + // load symbol address from dynamic library + for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { + const lib_symbol *symbol = &lib_symbols_kperf[i]; + *symbol->impl = dlsym(lib_handle_kperf, symbol->name); + if (!*symbol->impl) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperf function: %s.", symbol->name); + return_err(); + } + } + for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { + const lib_symbol *symbol = &lib_symbols_kperfdata[i]; + *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name); + if (!*symbol->impl) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperfdata function: %s.", symbol->name); + return_err(); + } + } + + lib_inited = true; + lib_has_err = false; + return true; + +#undef return_err +} + +// ----------------------------------------------------------------------------- +// kdebug private structs +// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h +// ----------------------------------------------------------------------------- + +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. + */ +#if defined(__arm64__) +typedef uint64_t kd_buf_argtype; +#else +typedef uintptr_t kd_buf_argtype; +#endif + +typedef struct { + uint64_t timestamp; + kd_buf_argtype arg1; + kd_buf_argtype arg2; + kd_buf_argtype arg3; + kd_buf_argtype arg4; + kd_buf_argtype arg5; /* the thread ID */ + uint32_t debugid; /* see */ + +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. + */ +#if defined(__LP64__) || defined(__arm64__) + uint32_t cpuid; /* cpu index, from 0 */ + kd_buf_argtype unused; +#endif +} kd_buf; + +/* bits for the type field of kd_regtype */ +#define KDBG_CLASSTYPE 0x10000 +#define KDBG_SUBCLSTYPE 0x20000 +#define KDBG_RANGETYPE 0x40000 +#define KDBG_TYPENONE 0x80000 +#define KDBG_CKTYPES 0xF0000 + +/* only trace at most 4 types of events, at the code granularity */ +#define KDBG_VALCHECK 0x00200000U + +typedef struct { + unsigned int type; + unsigned int value1; + unsigned int value2; + unsigned int value3; + unsigned int value4; +} kd_regtype; + +typedef struct { + /* number of events that can fit in the buffers */ + int nkdbufs; + /* set if trace is disabled */ + int nolog; + /* kd_ctrl_page.flags */ + unsigned int flags; + /* number of threads in thread map */ + int nkdthreads; + /* the owning pid */ + int bufid; +} kbufinfo_t; + + +// ----------------------------------------------------------------------------- +// Demo +// ----------------------------------------------------------------------------- + +#define EVENT_NAME_MAX 8 +typedef struct { + const char *alias; /// name for print + const char *names[EVENT_NAME_MAX]; /// name from pmc db +} event_alias; + +/// Event names from /usr/share/kpep/.plist +static const event_alias profile_events[] = { + {"cycles", + { + "FIXED_CYCLES", // Apple A7-A15 + "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th + "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom + }}, + {"instructions", + { + "FIXED_INSTRUCTIONS", // Apple A7-A15 + "INST_RETIRED.ANY" // Intel Yonah, Merom, Core 1th-10th + }}, + {"branches", + { + "INST_BRANCH", // Apple A7-A15 + "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th + "INST_RETIRED.ANY", // Intel Yonah, Merom + }}, + {"branch-misses", + { + "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12 + "BRANCH_MISPREDICT", // Apple A7-A14 + "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th + "BR_INST_RETIRED.MISPRED", // Intel Yonah, Merom + }}, +}; + +static kpep_event *get_event(kpep_db *db, const event_alias *alias) { + for (usize j = 0; j < EVENT_NAME_MAX; j++) { + const char *name = alias->names[j]; + if (!name) break; + kpep_event *ev = NULL; + if (kpep_db_event(db, name, &ev) == 0) { + return ev; + } + } + return NULL; +} + +struct AppleEvents { + kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; + usize counter_map[KPC_MAX_COUNTERS] = {0}; + u64 counters_0[KPC_MAX_COUNTERS] = {0}; + u64 counters_1[KPC_MAX_COUNTERS] = {0}; + static constexpr usize ev_count = + sizeof(profile_events) / sizeof(profile_events[0]); + + inline bool setup_performance_counters() { + static bool init = false; + static bool worked = false; + + if (init) { + return worked; + } + init = true; + + // load dylib + if (!lib_init()) { + printf("Error: %s\n", lib_err_msg); + return (worked = false); + } + + // check permission + int force_ctrs = 0; + if (kpc_force_all_ctrs_get(&force_ctrs)) { + printf("Permission denied, xnu/kpc requires root privileges.\n"); + return (worked = false); + } + int ret; + // load pmc db + kpep_db *db = NULL; + if ((ret = kpep_db_create(NULL, &db))) { + printf("Error: cannot load pmc database: %d.\n", ret); + return (worked = false); + } + // printf("loaded db: %s (%s)\n", db->name, db->marketing_name); + // printf("number of fixed counters: %zu\n", db->fixed_counter_count); + // printf("number of configurable counters: %zu\n", + // db->config_counter_count); + + // create a config + kpep_config *cfg = NULL; + if ((ret = kpep_config_create(db, &cfg))) { + printf("Failed to create kpep config: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_force_counters(cfg))) { + printf("Failed to force counters: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + + // get events + kpep_event *ev_arr[ev_count] = {0}; + for (usize i = 0; i < ev_count; i++) { + const event_alias *alias = profile_events + i; + ev_arr[i] = get_event(db, alias); + if (!ev_arr[i]) { + printf("Cannot find event: %s.\n", alias->alias); + return (worked = false); + } + } + + // add event to config + for (usize i = 0; i < ev_count; i++) { + kpep_event *ev = ev_arr[i]; + if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) { + printf("Failed to add event: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + } + + // prepare buffer and config + u32 classes = 0; + usize reg_count = 0; + if ((ret = kpep_config_kpc_classes(cfg, &classes))) { + printf("Failed get kpc classes: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_kpc_count(cfg, ®_count))) { + printf("Failed get kpc count: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) { + printf("Failed get kpc map: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) { + printf("Failed get kpc registers: %d (%s).\n", ret, + kpep_config_error_desc(ret)); + return (worked = false); + } + + // set config to kernel + if ((ret = kpc_force_all_ctrs_set(1))) { + printf("Failed force all ctrs: %d.\n", ret); + return (worked = false); + } + if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) { + if ((ret = kpc_set_config(classes, regs))) { + printf("Failed set kpc config: %d.\n", ret); + return (worked = false); + } + } + + // start counting + if ((ret = kpc_set_counting(classes))) { + printf("Failed set counting: %d.\n", ret); + return (worked = false); + } + if ((ret = kpc_set_thread_counting(classes))) { + printf("Failed set thread counting: %d.\n", ret); + return (worked = false); + } + + return (worked = true); + } + + inline performance_counters get_counters() { + static bool warned = false; + int ret; + // get counters before + if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) { + if (!warned) { + printf("Failed get thread counters before: %d.\n", ret); + warned = true; + } + return 1; + } + return performance_counters{ + counters_0[counter_map[0]], counters_0[counter_map[3]], + counters_0[counter_map[2]], counters_0[counter_map[1]]}; + } +}; + +#endif diff --git a/microbenchmarks/performancecounters/event_counter.h b/microbenchmarks/performancecounters/event_counter.h new file mode 100644 index 000000000..63e605690 --- /dev/null +++ b/microbenchmarks/performancecounters/event_counter.h @@ -0,0 +1,150 @@ +#ifndef __EVENT_COUNTER_H +#define __EVENT_COUNTER_H + +#include +#ifndef _MSC_VER +#include +#endif +#include + +#include + +#include +#include + +#include "linux-perf-events.h" +#ifdef __linux__ +#include +#endif + +#if __APPLE__ && __aarch64__ +#include "apple_arm_events.h" +#endif + +struct event_count { + std::chrono::duration elapsed; + std::vector event_counts; + event_count() : elapsed(0), event_counts{0, 0, 0, 0, 0} {} + event_count(const std::chrono::duration _elapsed, + const std::vector _event_counts) + : elapsed(_elapsed), event_counts(_event_counts) {} + event_count(const event_count& other) + : elapsed(other.elapsed), event_counts(other.event_counts) {} + + // The types of counters (so we can read the getter more easily) + enum event_counter_types { + CPU_CYCLES, + INSTRUCTIONS, + }; + + double elapsed_sec() const { + return std::chrono::duration(elapsed).count(); + } + double elapsed_ns() const { + return std::chrono::duration(elapsed).count(); + } + double cycles() const { + return static_cast(event_counts[CPU_CYCLES]); + } + double instructions() const { + return static_cast(event_counts[INSTRUCTIONS]); + } + + event_count& operator=(const event_count& other) { + this->elapsed = other.elapsed; + this->event_counts = other.event_counts; + return *this; + } + event_count operator+(const event_count& other) const { + return event_count(elapsed + other.elapsed, + { + event_counts[0] + other.event_counts[0], + event_counts[1] + other.event_counts[1], + event_counts[2] + other.event_counts[2], + event_counts[3] + other.event_counts[3], + event_counts[4] + other.event_counts[4], + }); + } + + void operator+=(const event_count& other) { *this = *this + other; } +}; + +struct event_aggregate { + bool has_events = false; + int iterations = 0; + event_count total{}; + event_count best{}; + event_count worst{}; + + event_aggregate() = default; + + void operator<<(const event_count& other) { + if (iterations == 0 || other.elapsed < best.elapsed) { + best = other; + } + if (iterations == 0 || other.elapsed > worst.elapsed) { + worst = other; + } + iterations++; + total += other; + } + + double elapsed_sec() const { return total.elapsed_sec() / iterations; } + double elapsed_ns() const { return total.elapsed_ns() / iterations; } + double cycles() const { return total.cycles() / iterations; } + double instructions() const { return total.instructions() / iterations; } +}; + +struct event_collector { + event_count count{}; + std::chrono::time_point start_clock{}; + +#if defined(__linux__) + LinuxEvents linux_events; + event_collector() + : linux_events(std::vector{ + PERF_COUNT_HW_CPU_CYCLES, + PERF_COUNT_HW_INSTRUCTIONS, + }) {} + bool has_events() { return linux_events.is_working(); } +#elif __APPLE__ && __aarch64__ + AppleEvents apple_events; + performance_counters diff; + event_collector() : diff(0) { apple_events.setup_performance_counters(); } + bool has_events() { return apple_events.setup_performance_counters(); } +#else + event_collector() {} + bool has_events() { return false; } +#endif + + inline void start() { +#if defined(__linux) + linux_events.start(); +#elif __APPLE__ && __aarch64__ + if (has_events()) { + diff = apple_events.get_counters(); + } +#endif + start_clock = std::chrono::steady_clock::now(); + } + inline event_count& end() { + const auto end_clock = std::chrono::steady_clock::now(); +#if defined(__linux) + linux_events.end(count.event_counts); +#elif __APPLE__ && __aarch64__ + if (has_events()) { + performance_counters end = apple_events.get_counters(); + diff = end - diff; + } + count.event_counts[0] = diff.cycles; + count.event_counts[1] = diff.instructions; + count.event_counts[2] = diff.missed_branches; + count.event_counts[3] = 0; + count.event_counts[4] = diff.branches; +#endif + count.elapsed = end_clock - start_clock; + return count; + } +}; + +#endif diff --git a/microbenchmarks/performancecounters/ibireme.h b/microbenchmarks/performancecounters/ibireme.h new file mode 100644 index 000000000..363d5d03b --- /dev/null +++ b/microbenchmarks/performancecounters/ibireme.h @@ -0,0 +1,917 @@ +// ============================================================================= +// XNU kperf/kpc +// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges +// +// References: +// +// XNU source (since xnu 2422.1.72): +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h +// https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c +// +// Lightweight PET (Profile Every Thread, since xnu 3789.1.32): +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c +// +// System Private frameworks (since macOS 10.11, iOS 8.0): +// /System/Library/PrivateFrameworks/kperf.framework +// /System/Library/PrivateFrameworks/kperfdata.framework +// +// Xcode framework (since Xcode 7.0): +// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework +// +// CPU database (plist files) +// macOS (since macOS 10.11): +// /usr/share/kpep/.plist +// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0): +// /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform +// /DeviceSupport//DeveloperDiskImage.dmg/usr/share/kpep/.plist +// +// +// Created by YaoYuan on 2021. +// Released into the public domain (unlicense.org). +// ============================================================================= + +#include +#include +#include +#include +#include + +#include // for dlopen() and dlsym() +#include // for mach_absolute_time() +#include // for kdebug trace decode +#include // for sysctl() +#include // for usleep() + +typedef float f32; +typedef double f64; +typedef int8_t i8; +typedef uint8_t u8; +typedef int16_t i16; +typedef uint16_t u16; +typedef int32_t i32; +typedef uint32_t u32; +typedef int64_t i64; +typedef uint64_t u64; +typedef size_t usize; + +// ----------------------------------------------------------------------------- +// header (reverse engineered) +// This framework wraps some sysctl calls to communicate with the kpc in kernel. +// Most functions requires root privileges, or process is "blessed". +// ----------------------------------------------------------------------------- + +// Cross-platform class constants. +#define KPC_CLASS_FIXED (0) +#define KPC_CLASS_CONFIGURABLE (1) +#define KPC_CLASS_POWER (2) +#define KPC_CLASS_RAWPMU (3) + +// Cross-platform class mask constants. +#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) // 1 +#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2 +#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) // 4 +#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) // 8 + +// PMU version constants. +#define KPC_PMU_ERROR (0) // Error +#define KPC_PMU_INTEL_V3 (1) // Intel +#define KPC_PMU_ARM_APPLE (2) // ARM64 +#define KPC_PMU_INTEL_V2 (3) // Old Intel +#define KPC_PMU_ARM_V2 (4) // Old ARM + +// The maximum number of counters we could read from every class in one go. +// ARMV7: FIXED: 1, CONFIGURABLE: 4 +// ARM32: FIXED: 2, CONFIGURABLE: 6 +// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8) +// x86: 32 +#define KPC_MAX_COUNTERS 32 + +// Bits for defining what to do on an action. +// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h +#define KPERF_SAMPLER_TH_INFO (1U << 0) +#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1) +#define KPERF_SAMPLER_KSTACK (1U << 2) +#define KPERF_SAMPLER_USTACK (1U << 3) +#define KPERF_SAMPLER_PMC_THREAD (1U << 4) +#define KPERF_SAMPLER_PMC_CPU (1U << 5) +#define KPERF_SAMPLER_PMC_CONFIG (1U << 6) +#define KPERF_SAMPLER_MEMINFO (1U << 7) +#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8) +#define KPERF_SAMPLER_TH_DISPATCH (1U << 9) +#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10) +#define KPERF_SAMPLER_SYS_MEM (1U << 11) +#define KPERF_SAMPLER_TH_INSCYC (1U << 12) +#define KPERF_SAMPLER_TK_INFO (1U << 13) + +// Maximum number of kperf action ids. +#define KPERF_ACTION_MAX (32) + +// Maximum number of kperf timer ids. +#define KPERF_TIMER_MAX (8) + +// x86/arm config registers are 64-bit +typedef u64 kpc_config_t; + +/// Print current CPU identification string to the buffer (same as snprintf), +/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC +/// database in /usr/share/kpep. +/// @return string's length, or negative value if error occurs. +/// @note This method does not requires root privileges. +/// @details sysctl get(hw.cputype), get(hw.cpusubtype), +/// get(hw.cpufamily), get(machdep.cpu.model) +static int (*kpc_cpu_string)(char *buf, usize buf_size); + +/// Get the version of KPC that's being run. +/// @return See `PMU version constants` above. +/// @details sysctl get(kpc.pmu_version) +static u32 (*kpc_pmu_version)(void); + +/// Get running PMC classes. +/// @return See `class mask constants` above, +/// 0 if error occurs or no class is set. +/// @details sysctl get(kpc.counting) +static u32 (*kpc_get_counting)(void); + +/// Set PMC classes to enable counting. +/// @param classes See `class mask constants` above, set 0 to shutdown counting. +/// @return 0 for success. +/// @details sysctl set(kpc.counting) +static int (*kpc_set_counting)(u32 classes); + +/// Get running PMC classes for current thread. +/// @return See `class mask constants` above, +/// 0 if error occurs or no class is set. +/// @details sysctl get(kpc.thread_counting) +static u32 (*kpc_get_thread_counting)(void); + +/// Set PMC classes to enable counting for current thread. +/// @param classes See `class mask constants` above, set 0 to shutdown counting. +/// @return 0 for success. +/// @details sysctl set(kpc.thread_counting) +static int (*kpc_set_thread_counting)(u32 classes); + +/// Get how many config registers there are for a given mask. +/// For example: Intel may returns 1 for `KPC_CLASS_FIXED_MASK`, +/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`. +/// @param classes See `class mask constants` above. +/// @return 0 if error occurs or no class is set. +/// @note This method does not requires root privileges. +/// @details sysctl get(kpc.config_count) +static u32 (*kpc_get_config_count)(u32 classes); + +/// Get config registers. +/// @param classes see `class mask constants` above. +/// @param config Config buffer to receive values, should not smaller than +/// kpc_get_config_count(classes) * sizeof(kpc_config_t). +/// @return 0 for success. +/// @details sysctl get(kpc.config_count), get(kpc.config) +static int (*kpc_get_config)(u32 classes, kpc_config_t *config); + +/// Set config registers. +/// @param classes see `class mask constants` above. +/// @param config Config buffer, should not smaller than +/// kpc_get_config_count(classes) * sizeof(kpc_config_t). +/// @return 0 for success. +/// @details sysctl get(kpc.config_count), set(kpc.config) +static int (*kpc_set_config)(u32 classes, kpc_config_t *config); + +/// Get how many counters there are for a given mask. +/// For example: Intel may returns 3 for `KPC_CLASS_FIXED_MASK`, +/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`. +/// @param classes See `class mask constants` above. +/// @note This method does not requires root privileges. +/// @details sysctl get(kpc.counter_count) +static u32 (*kpc_get_counter_count)(u32 classes); + +/// Get counter accumulations. +/// If `all_cpus` is true, the buffer count should not smaller than +/// (cpu_count * counter_count). Otherwize, the buffer count should not smaller +/// than (counter_count). +/// @see kpc_get_counter_count(), kpc_cpu_count(). +/// @param all_cpus true for all CPUs, false for current cpu. +/// @param classes See `class mask constants` above. +/// @param curcpu A pointer to receive current cpu id, can be NULL. +/// @param buf Buffer to receive counter's value. +/// @return 0 for success. +/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters) +static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu, + u64 *buf); + +/// Get counter accumulations for current thread. +/// @param tid Thread id, should be 0. +/// @param buf_count The number of buf's elements (not bytes), +/// should not smaller than kpc_get_counter_count(). +/// @param buf Buffer to receive counter's value. +/// @return 0 for success. +/// @details sysctl get(kpc.thread_counters) +static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf); + +/// Acquire/release the counters used by the Power Manager. +/// @param val 1:acquire, 0:release +/// @return 0 for success. +/// @details sysctl set(kpc.force_all_ctrs) +static int (*kpc_force_all_ctrs_set)(int val); + +/// Get the state of all_ctrs. +/// @return 0 for success. +/// @details sysctl get(kpc.force_all_ctrs) +static int (*kpc_force_all_ctrs_get)(int *val_out); + +/// Set number of actions, should be `KPERF_ACTION_MAX`. +/// @details sysctl set(kperf.action.count) +static int (*kperf_action_count_set)(u32 count); + +/// Get number of actions. +/// @details sysctl get(kperf.action.count) +static int (*kperf_action_count_get)(u32 *count); + +/// Set what to sample when a trigger fires an action, e.g. +/// `KPERF_SAMPLER_PMC_CPU`. +/// @details sysctl set(kperf.action.samplers) +static int (*kperf_action_samplers_set)(u32 actionid, u32 sample); + +/// Get what to sample when a trigger fires an action. +/// @details sysctl get(kperf.action.samplers) +static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample); + +/// Apply a task filter to the action, -1 to disable filter. +/// @details sysctl set(kperf.action.filter_by_task) +static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port); + +/// Apply a pid filter to the action, -1 to disable filter. +/// @details sysctl set(kperf.action.filter_by_pid) +static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid); + +/// Set number of time triggers, should be `KPERF_TIMER_MAX`. +/// @details sysctl set(kperf.timer.count) +static int (*kperf_timer_count_set)(u32 count); + +/// Get number of time triggers. +/// @details sysctl get(kperf.timer.count) +static int (*kperf_timer_count_get)(u32 *count); + +/// Set timer number and period. +/// @details sysctl set(kperf.timer.period) +static int (*kperf_timer_period_set)(u32 actionid, u64 tick); + +/// Get timer number and period. +/// @details sysctl get(kperf.timer.period) +static int (*kperf_timer_period_get)(u32 actionid, u64 *tick); + +/// Set timer number and actionid. +/// @details sysctl set(kperf.timer.action) +static int (*kperf_timer_action_set)(u32 actionid, u32 timerid); + +/// Get timer number and actionid. +/// @details sysctl get(kperf.timer.action) +static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid); + +/// Set which timer ID does PET (Profile Every Thread). +/// @details sysctl set(kperf.timer.pet_timer) +static int (*kperf_timer_pet_set)(u32 timerid); + +/// Get which timer ID does PET (Profile Every Thread). +/// @details sysctl get(kperf.timer.pet_timer) +static int (*kperf_timer_pet_get)(u32 *timerid); + +/// Enable or disable sampling. +/// @details sysctl set(kperf.sampling) +static int (*kperf_sample_set)(u32 enabled); + +/// Get is currently sampling. +/// @details sysctl get(kperf.sampling) +static int (*kperf_sample_get)(u32 *enabled); + +/// Reset kperf: stop sampling, kdebug, timers and actions. +/// @return 0 for success. +static int (*kperf_reset)(void); + +/// Nanoseconds to CPU ticks. +static u64 (*kperf_ns_to_ticks)(u64 ns); + +/// CPU ticks to nanoseconds. +static u64 (*kperf_ticks_to_ns)(u64 ticks); + +/// CPU ticks frequency (mach_absolute_time). +static u64 (*kperf_tick_frequency)(void); + +/// Get lightweight PET mode (not in kperf.framework). +static int kperf_lightweight_pet_get(u32 *enabled) { + if (!enabled) + return -1; + usize size = 4; + return sysctlbyname("kperf.lightweight_pet", enabled, &size, NULL, 0); +} + +/// Set lightweight PET mode (not in kperf.framework). +static int kperf_lightweight_pet_set(u32 enabled) { + return sysctlbyname("kperf.lightweight_pet", NULL, NULL, &enabled, 4); +} + +// ----------------------------------------------------------------------------- +// header (reverse engineered) +// This framework provides some functions to access the local CPU database. +// These functions do not require root privileges. +// ----------------------------------------------------------------------------- + +// KPEP CPU archtecture constants. +#define KPEP_ARCH_I386 0 +#define KPEP_ARCH_X86_64 1 +#define KPEP_ARCH_ARM 2 +#define KPEP_ARCH_ARM64 3 + +/// KPEP event (size: 48/28 bytes on 64/32 bit OS) +typedef struct kpep_event { + const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY". + const char *description; ///< Description for this event. + const char *errata; ///< Errata, currently NULL. + const char *alias; ///< Alias name, such as "Instructions", "Cycles". + const char *fallback; ///< Fallback event name for fixed counter. + u32 mask; + u8 number; + u8 umask; + u8 reserved; + u8 is_fixed; +} kpep_event; + +/// KPEP database (size: 144/80 bytes on 64/32 bit OS) +typedef struct kpep_db { + const char *name; ///< Database name, such as "haswell". + const char *cpu_id; ///< Plist name, such as "cpu_7_8_10b282dc". + const char *marketing_name; ///< Marketing name, such as "Intel Haswell". + void *plist_data; ///< Plist data (CFDataRef), currently NULL. + void *event_map; ///< All events (CFDict). + kpep_event + *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count). + kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *) + ///< * fixed_counter_count) + void *alias_map; ///< All aliases (CFDict). + usize reserved_1; + usize reserved_2; + usize reserved_3; + usize event_count; ///< All events count. + usize alias_count; + usize fixed_counter_count; + usize config_counter_count; + usize power_counter_count; + u32 archtecture; ///< see `KPEP CPU archtecture constants` above. + u32 fixed_counter_bits; + u32 config_counter_bits; + u32 power_counter_bits; +} kpep_db; + +/// KPEP config (size: 80/44 bytes on 64/32 bit OS) +typedef struct kpep_config { + kpep_db *db; + kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL + usize *ev_map; ///< (sizeof(usize *) * counter_count), init 0 + usize *ev_idx; ///< (sizeof(usize *) * counter_count), init -1 + u32 *flags; ///< (sizeof(u32 *) * counter_count), init 0 + u64 *kpc_periods; ///< (sizeof(u64 *) * counter_count), init 0 + usize event_count; /// kpep_config_events_count() + usize counter_count; + u32 classes; ///< See `class mask constants` above. + u32 config_counter; + u32 power_counter; + u32 reserved; +} kpep_config; + +/// Error code for kpep_config_xxx() and kpep_db_xxx() functions. +typedef enum { + KPEP_CONFIG_ERROR_NONE = 0, + KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1, + KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2, + KPEP_CONFIG_ERROR_IO = 3, + KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4, + KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5, + KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6, + KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7, + KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8, + KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9, + KPEP_CONFIG_ERROR_DB_CORRUPT = 10, + KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11, + KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12, + KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13, + KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14, + KPEP_CONFIG_ERROR_ERRNO = 15, + KPEP_CONFIG_ERROR_MAX +} kpep_config_error_code; + +/// Error description for kpep_config_error_code. +static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = { + "none", + "invalid argument", + "out of memory", + "I/O", + "buffer too small", + "current system unknown", + "database path invalid", + "database not found", + "database architecture unsupported", + "database version unsupported", + "database corrupt", + "event not found", + "conflicting events", + "all counters must be forced", + "event unavailable", + "check errno"}; + +/// Error description. +static const char *kpep_config_error_desc(int code) { + if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) { + return kpep_config_error_names[code]; + } + return "unknown error"; +} + +/// Create a config. +/// @param db A kpep db, see kpep_db_create() +/// @param cfg_ptr A pointer to receive the new config. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr); + +/// Free the config. +static void (*kpep_config_free)(kpep_config *cfg); + +/// Add an event to config. +/// @param cfg The config. +/// @param ev_ptr A event pointer. +/// @param flag 0: all, 1: user space only +/// @param err Error bitmap pointer, can be NULL. +/// If return value is `CONFLICTING_EVENTS`, this bitmap contains +/// the conflicted event indices, e.g. "1 << 2" means index 2. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr, + u32 flag, u32 *err); + +/// Remove event at index. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx); + +/// Force all counters. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_force_counters)(kpep_config *cfg); + +/// Get events count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr); + +/// Get all event pointers. +/// @param buf A buffer to receive event pointers. +/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_events_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf, + usize buf_size); + +/// Get kpc register configs. +/// @param buf A buffer to receive kpc register configs. +/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_kpc_count() * sizeof(kpc_config_t). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf, + usize buf_size); + +/// Get kpc register config count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr); + +/// Get kpc classes. +/// @param classes See `class mask constants` above. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr); + +/// Get the index mapping from event to counter. +/// @param buf A buffer to receive indexes. +/// @param buf_size The buffer's size in bytes, should not smaller than +/// kpep_config_events_count() * sizeof(kpc_config_t). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size); + +/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/". +/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8". +/// Pass NULL for current CPU. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_create)(const char *name, kpep_db **db_ptr); + +/// Free the kpep database. +static void (*kpep_db_free)(kpep_db *db); + +/// Get the database's name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_name)(kpep_db *db, const char **name); + +/// Get the event alias count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_aliases_count)(kpep_db *db, usize *count); + +/// Get all alias. +/// @param buf A buffer to receive all alias strings. +/// @param buf_size The buffer's size in bytes, +/// should not smaller than kpep_db_aliases_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size); + +/// Get counters count for given classes. +/// @param classes 1: Fixed, 2: Configurable. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count); + +/// Get all event count. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_events_count)(kpep_db *db, usize *count); + +/// Get all events. +/// @param buf A buffer to receive all event pointers. +/// @param buf_size The buffer's size in bytes, +/// should not smaller than kpep_db_events_count() * sizeof(void *). +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size); + +/// Get one event by name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr); + +/// Get event's name. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr); + +/// Get event's alias. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr); + +/// Get event's description. +/// @return kpep_config_error_code, 0 for success. +static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr); + +// ----------------------------------------------------------------------------- +// load kperf/kperfdata dynamic library +// ----------------------------------------------------------------------------- + +typedef struct { + const char *name; + void **impl; +} lib_symbol; + +#define lib_nelems(x) (sizeof(x) / sizeof((x)[0])) +#define lib_symbol_def(name) \ + { \ +#name, (void **)&name \ + } + +static const lib_symbol lib_symbols_kperf[] = { + lib_symbol_def(kpc_pmu_version), + lib_symbol_def(kpc_cpu_string), + lib_symbol_def(kpc_set_counting), + lib_symbol_def(kpc_get_counting), + lib_symbol_def(kpc_set_thread_counting), + lib_symbol_def(kpc_get_thread_counting), + lib_symbol_def(kpc_get_config_count), + lib_symbol_def(kpc_get_counter_count), + lib_symbol_def(kpc_set_config), + lib_symbol_def(kpc_get_config), + lib_symbol_def(kpc_get_cpu_counters), + lib_symbol_def(kpc_get_thread_counters), + lib_symbol_def(kpc_force_all_ctrs_set), + lib_symbol_def(kpc_force_all_ctrs_get), + lib_symbol_def(kperf_action_count_set), + lib_symbol_def(kperf_action_count_get), + lib_symbol_def(kperf_action_samplers_set), + lib_symbol_def(kperf_action_samplers_get), + lib_symbol_def(kperf_action_filter_set_by_task), + lib_symbol_def(kperf_action_filter_set_by_pid), + lib_symbol_def(kperf_timer_count_set), + lib_symbol_def(kperf_timer_count_get), + lib_symbol_def(kperf_timer_period_set), + lib_symbol_def(kperf_timer_period_get), + lib_symbol_def(kperf_timer_action_set), + lib_symbol_def(kperf_timer_action_get), + lib_symbol_def(kperf_sample_set), + lib_symbol_def(kperf_sample_get), + lib_symbol_def(kperf_reset), + lib_symbol_def(kperf_timer_pet_set), + lib_symbol_def(kperf_timer_pet_get), + lib_symbol_def(kperf_ns_to_ticks), + lib_symbol_def(kperf_ticks_to_ns), + lib_symbol_def(kperf_tick_frequency), +}; + +static const lib_symbol lib_symbols_kperfdata[] = { + lib_symbol_def(kpep_config_create), + lib_symbol_def(kpep_config_free), + lib_symbol_def(kpep_config_add_event), + lib_symbol_def(kpep_config_remove_event), + lib_symbol_def(kpep_config_force_counters), + lib_symbol_def(kpep_config_events_count), + lib_symbol_def(kpep_config_events), + lib_symbol_def(kpep_config_kpc), + lib_symbol_def(kpep_config_kpc_count), + lib_symbol_def(kpep_config_kpc_classes), + lib_symbol_def(kpep_config_kpc_map), + lib_symbol_def(kpep_db_create), + lib_symbol_def(kpep_db_free), + lib_symbol_def(kpep_db_name), + lib_symbol_def(kpep_db_aliases_count), + lib_symbol_def(kpep_db_aliases), + lib_symbol_def(kpep_db_counters_count), + lib_symbol_def(kpep_db_events_count), + lib_symbol_def(kpep_db_events), + lib_symbol_def(kpep_db_event), + lib_symbol_def(kpep_event_name), + lib_symbol_def(kpep_event_alias), + lib_symbol_def(kpep_event_description), +}; + +#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf" +#define lib_path_kperfdata \ + "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata" + +static bool lib_inited = false; +static bool lib_has_err = false; +static char lib_err_msg[256]; + +static void *lib_handle_kperf = NULL; +static void *lib_handle_kperfdata = NULL; + +static void lib_deinit(void) { + lib_inited = false; + lib_has_err = false; + if (lib_handle_kperf) + dlclose(lib_handle_kperf); + if (lib_handle_kperfdata) + dlclose(lib_handle_kperfdata); + lib_handle_kperf = NULL; + lib_handle_kperfdata = NULL; + for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { + const lib_symbol *symbol = &lib_symbols_kperf[i]; + *symbol->impl = NULL; + } + for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { + const lib_symbol *symbol = &lib_symbols_kperfdata[i]; + *symbol->impl = NULL; + } +} + +static bool lib_init(void) { +#define return_err() \ + do { \ + lib_deinit(); \ + lib_inited = true; \ + lib_has_err = true; \ + return false; \ + } while (false) + + if (lib_inited) + return !lib_has_err; + + // load dynamic library + lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY); + if (!lib_handle_kperf) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperf.framework, message: %s.", dlerror()); + return_err(); + } + lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY); + if (!lib_handle_kperfdata) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperfdata.framework, message: %s.", dlerror()); + return_err(); + } + + // load symbol address from dynamic library + for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { + const lib_symbol *symbol = &lib_symbols_kperf[i]; + *symbol->impl = dlsym(lib_handle_kperf, symbol->name); + if (!*symbol->impl) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperf function: %s.", symbol->name); + return_err(); + } + } + for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { + const lib_symbol *symbol = &lib_symbols_kperfdata[i]; + *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name); + if (!*symbol->impl) { + snprintf(lib_err_msg, sizeof(lib_err_msg), + "Failed to load kperfdata function: %s.", symbol->name); + return_err(); + } + } + + lib_inited = true; + lib_has_err = false; + return true; + +#undef return_err +} + +// ----------------------------------------------------------------------------- +// kdebug private structs +// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h +// ----------------------------------------------------------------------------- + +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. + */ +#if defined(__arm64__) +typedef uint64_t kd_buf_argtype; +#else +typedef uintptr_t kd_buf_argtype; +#endif + +typedef struct { + uint64_t timestamp; + kd_buf_argtype arg1; + kd_buf_argtype arg2; + kd_buf_argtype arg3; + kd_buf_argtype arg4; + kd_buf_argtype arg5; /* the thread ID */ + uint32_t debugid; /* see */ + +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. + */ +#if defined(__LP64__) || defined(__arm64__) + uint32_t cpuid; /* cpu index, from 0 */ + kd_buf_argtype unused; +#endif +} kd_buf; + +/* bits for the type field of kd_regtype */ +#define KDBG_CLASSTYPE 0x10000 +#define KDBG_SUBCLSTYPE 0x20000 +#define KDBG_RANGETYPE 0x40000 +#define KDBG_TYPENONE 0x80000 +#define KDBG_CKTYPES 0xF0000 + +/* only trace at most 4 types of events, at the code granularity */ +#define KDBG_VALCHECK 0x00200000U + +typedef struct { + unsigned int type; + unsigned int value1; + unsigned int value2; + unsigned int value3; + unsigned int value4; +} kd_regtype; + +typedef struct { + /* number of events that can fit in the buffers */ + int nkdbufs; + /* set if trace is disabled */ + int nolog; + /* kd_ctrl_page.flags */ + unsigned int flags; + /* number of threads in thread map */ + int nkdthreads; + /* the owning pid */ + int bufid; +} kbufinfo_t; + +// ----------------------------------------------------------------------------- +// kdebug utils +// ----------------------------------------------------------------------------- + +/// Clean up trace buffers and reset ktrace/kdebug/kperf. +/// @return 0 on success. +static int kdebug_reset(void) { + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE}; + return sysctl(mib, 3, NULL, NULL, NULL, 0); +} + +/// Disable and reinitialize the trace buffers. +/// @return 0 on success. +static int kdebug_reinit(void) { + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETUP}; + return sysctl(mib, 3, NULL, NULL, NULL, 0); +} + +/// Set debug filter. +static int kdebug_setreg(kd_regtype *kdr) { + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETREG}; + usize size = sizeof(kd_regtype); + return sysctl(mib, 3, kdr, &size, NULL, 0); +} + +/// Set maximum number of trace entries (kd_buf). +/// Only allow allocation up to half the available memory (sane_size). +/// @return 0 on success. +static int kdebug_trace_setbuf(int nbufs) { + int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, nbufs}; + return sysctl(mib, 4, NULL, NULL, NULL, 0); +} + +/// Enable or disable kdebug trace. +/// Trace buffer must already be initialized. +/// @return 0 on success. +static int kdebug_trace_enable(bool enable) { + int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, enable}; + return sysctl(mib, 4, NULL, 0, NULL, 0); +} + +/// Retrieve trace buffer information from kernel. +/// @return 0 on success. +static int kdebug_get_bufinfo(kbufinfo_t *info) { + if (!info) + return -1; + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDGETBUF}; + size_t needed = sizeof(kbufinfo_t); + return sysctl(mib, 3, info, &needed, NULL, 0); +} + +/// Retrieve trace buffers from kernel. +/// @param buf Memory to receive buffer data, array of `kd_buf`. +/// @param len Length of `buf` in bytes. +/// @param count Number of trace entries (kd_buf) obtained. +/// @return 0 on success. +static int kdebug_trace_read(void *buf, usize len, usize *count) { + if (count) + *count = 0; + if (!buf || !len) + return -1; + + // Note: the input and output units are not the same. + // input: bytes + // output: number of kd_buf + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREADTR}; + int ret = sysctl(mib, 3, buf, &len, NULL, 0); + if (ret != 0) + return ret; + *count = len; + return 0; +} + +/// Block until there are new buffers filled or `timeout_ms` have passed. +/// @param timeout_ms timeout milliseconds, 0 means wait forever. +/// @param suc set true if new buffers filled. +/// @return 0 on success. +static int kdebug_wait(usize timeout_ms, bool *suc) { + if (timeout_ms == 0) + return -1; + int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDBUFWAIT}; + usize val = timeout_ms; + int ret = sysctl(mib, 3, NULL, &val, NULL, 0); + if (suc) + *suc = !!val; + return ret; +} + +// ----------------------------------------------------------------------------- +// Demo +// ----------------------------------------------------------------------------- + +#define EVENT_NAME_MAX 8 +typedef struct { + const char *alias; /// name for print + const char *names[EVENT_NAME_MAX]; /// name from pmc db +} event_alias; + +/// Event names from /usr/share/kpep/.plist +static const event_alias profile_events[] = { + {"cycles", + { + "FIXED_CYCLES", // Apple A7-A15 + "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th + "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom + }}, + {"instructions", + { + "FIXED_INSTRUCTIONS", // Apple A7-A15 + "INST_RETIRED.ANY" // Intel Yonah, Merom, Core 1th-10th + }}, + {"branches", + { + "INST_BRANCH", // Apple A7-A15 + "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th + "INST_RETIRED.ANY", // Intel Yonah, Merom + }}, + {"branch-misses", + { + "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12 + "BRANCH_MISPREDICT", // Apple A7-A14 + "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th + "BR_INST_RETIRED.MISPRED", // Intel Yonah, Merom + }}, +}; + +static kpep_event *get_event(kpep_db *db, const event_alias *alias) { + for (usize j = 0; j < EVENT_NAME_MAX; j++) { + const char *name = alias->names[j]; + if (!name) + break; + kpep_event *ev = NULL; + if (kpep_db_event(db, name, &ev) == 0) { + return ev; + } + } + return NULL; +} + +kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; +usize counter_map[KPC_MAX_COUNTERS] = {0}; +u64 counters_0[KPC_MAX_COUNTERS] = {0}; +u64 counters_1[KPC_MAX_COUNTERS] = {0}; +const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]); diff --git a/microbenchmarks/performancecounters/linux-perf-events.h b/microbenchmarks/performancecounters/linux-perf-events.h new file mode 100644 index 000000000..494aeb738 --- /dev/null +++ b/microbenchmarks/performancecounters/linux-perf-events.h @@ -0,0 +1,101 @@ +// https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h +#pragma once +#ifdef __linux__ + +#include // for __NR_perf_event_open +#include // for perf event constants +#include // for ioctl +#include // for syscall + +#include // for errno +#include // for memset +#include + +#include +#include + +template +class LinuxEvents { + int fd; + bool working; + perf_event_attr attribs{}; + size_t num_events{}; + std::vector temp_result_vec{}; + std::vector ids{}; + + public: + explicit LinuxEvents(std::vector config_vec) : fd(0), working(true) { + memset(&attribs, 0, sizeof(attribs)); + attribs.type = TYPE; + attribs.size = sizeof(attribs); + attribs.disabled = 1; + attribs.exclude_kernel = 1; + attribs.exclude_hv = 1; + + attribs.sample_period = 0; + attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; + const int pid = 0; // the current process + const int cpu = -1; // all CPUs + const unsigned long flags = 0; + + int group = -1; // no group + num_events = config_vec.size(); + ids.resize(config_vec.size()); + uint32_t i = 0; + for (auto config : config_vec) { + attribs.config = config; + fd = static_cast( + syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags)); + if (fd == -1) { + report_error("perf_event_open"); + } + ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]); + if (group == -1) { + group = fd; + } + } + + temp_result_vec.resize(num_events * 2 + 1); + } + + ~LinuxEvents() { + if (fd != -1) { + close(fd); + } + } + + inline void start() { + if (fd != -1) { + if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) { + report_error("ioctl(PERF_EVENT_IOC_RESET)"); + } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) { + report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); + } + } + } + + inline void end(std::vector &results) { + if (fd != -1) { + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) { + report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); + } + + if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) { + report_error("read"); + } + } + // our actual results are in slots 1,3,5, ... of this structure + // we really should be checking our ids obtained earlier to be safe + for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) { + results[i / 2] = temp_result_vec[i]; + } + } + + bool is_working() { return working; } + + private: + void report_error(const std::string &) { working = false; } +}; +#endif From 450a463d8e6c7f25062b214d240dd5bf0ac1631e Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 19:19:34 +0000 Subject: [PATCH 03/10] Various fixes --- CMakeLists.txt | 9 +++++---- include/roaring/isadetection.h | 8 +++++--- include/roaring/misc/configreport.h | 2 ++ src/CMakeLists.txt | 30 ++++++++++++++++------------- tests/cbitset_unit.c | 4 ++-- tools/cmake/FindCTargets.cmake | 8 ++++---- 6 files changed, 35 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21011042f..835672c87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(ROARING_DISABLE_X64 "Forcefully disable x64 optimizations even if hardwar option(ROARING_DISABLE_AVX "Forcefully disable AVX even if hardware supports it " OFF) option(ROARING_DISABLE_NEON "Forcefully disable NEON even if hardware supports it" OFF) option(ROARING_DISABLE_NATIVE "Forcefully disable -march optimizations (obsolete)" OFF) +option(ROARING_DISABLE_AVX512 "Forcefully disable AVX512 even if compiler supports it" OFF) option(ROARING_BUILD_STATIC "Build a static library" ON) if(BUILD_SHARED_LIBS) @@ -62,7 +63,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/roaring.pc" DESTINATION ${CMAKE_INSTA add_library(roaring-headers INTERFACE) target_include_directories(roaring-headers INTERFACE - $ + $ $) add_library(roaring-headers-cpp INTERFACE) target_include_directories(roaring-headers-cpp INTERFACE @@ -73,11 +74,11 @@ target_include_directories(roaring-headers-cpp INTERFACE ### Some users want the C++ header files to be installed as well. ### C++ header files get installed to /usr/local/include/roaring typically SET(CPP_ROARING_HEADERS cpp/roaring64map.hh cpp/roaring.hh) # needs to be updated if we add more files -install(FILES ${CPP_ROARING_HEADERS} DESTINATION include/${ROARING_LIB_NAME}) -install(DIRECTORY include/${ROARING_LIB_NAME} DESTINATION include) +install(FILES ${CPP_ROARING_HEADERS} DESTINATION include/roaring) +install(DIRECTORY include/roaring DESTINATION include) install(TARGETS roaring-headers roaring-headers-cpp - EXPORT ${ROARING_LIB_NAME}-config + EXPORT roaring-config ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} diff --git a/include/roaring/isadetection.h b/include/roaring/isadetection.h index 39d1d6621..356b4cd9a 100644 --- a/include/roaring/isadetection.h +++ b/include/roaring/isadetection.h @@ -52,7 +52,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include - +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 #ifdef __has_include // We want to make sure that the AVX-512 functions are only built on compilers // fully supporting AVX-512. @@ -67,6 +67,7 @@ POSSIBILITY OF SUCH DAMAGE. #define CROARING_COMPILER_SUPPORTS_AVX512 1 #endif #endif +#endif // We need portability.h to be included first, see // https://github.com/RoaringBitmap/CRoaring/issues/394 @@ -96,15 +97,15 @@ enum croaring_instruction_set { CROARING_UNINITIALIZED = 0x8000 }; +#if CROARING_COMPILER_SUPPORTS_AVX512 static unsigned int CROARING_AVX512_REQUIRED = (CROARING_AVX512F | CROARING_AVX512DQ | CROARING_AVX512BW | CROARING_AVX512VBMI2 | CROARING_AVX512BITALG | CROARING_AVX512VPOPCNTDQ); - +#endif #if defined(__x86_64__) || defined(_M_AMD64) // x64 static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { - #if CROARING_REGULAR_VISUAL_STUDIO int cpu_info[4]; __cpuid(cpu_info, *eax); @@ -131,6 +132,7 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, * as one compilation unit. */ static inline uint32_t dynamic_croaring_detect_supported_architectures() { +printf("dynamic_croaring_detect_supported_architectures\n"); uint32_t eax, ebx, ecx, edx; uint32_t host_isa = 0x0; // Can be found on Intel ISA Reference for CPUID diff --git a/include/roaring/misc/configreport.h b/include/roaring/misc/configreport.h index 01974a77b..04f0cea89 100644 --- a/include/roaring/misc/configreport.h +++ b/include/roaring/misc/configreport.h @@ -191,12 +191,14 @@ static inline void tellmeall() { printf( "AVX2 not used\t"); } } +#if CROARING_COMPILER_SUPPORTS_AVX512 if((config & CROARING_AVX512_REQUIRED) == CROARING_AVX512_REQUIRED) { printf( "AVX-512 detected\t"); if(!croaring_avx2()) { printf( "AVX-512 not used\t"); } } +#endif if((config & CROARING_SSE42) == CROARING_SSE42) { printf(" SSE4.2 detected\t"); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b3e0d02a..ff7e069d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -39,34 +39,38 @@ if(ROARING_BUILD_C_AS_CPP) # more checks and tools, e.g. analysis SET_SOURCE_FILES_PROPERTIES(${ROARING_SRC} PROPERTIES LANGUAGE CXX) endif() -add_library(${ROARING_LIB_NAME} ${ROARING_LIB_TYPE} ${ROARING_SRC}) -target_include_directories(${ROARING_LIB_NAME} +add_library(roaring ${ROARING_LIB_TYPE} ${ROARING_SRC}) +if(ROARING_DISABLE_AVX512) + target_compile_definitions(roaring PUBLIC CROARING_COMPILER_SUPPORTS_AVX512=0) +endif(ROARING_DISABLE_AVX512) + +target_include_directories(roaring PUBLIC $ $ ) -target_link_libraries(${ROARING_LIB_NAME} PUBLIC roaring-headers) -target_link_libraries(${ROARING_LIB_NAME} PUBLIC roaring-headers-cpp) +target_link_libraries(roaring PUBLIC roaring-headers) +target_link_libraries(roaring PUBLIC roaring-headers-cpp) # -#install(TARGETS ${ROARING_LIB_NAME} DESTINATION lib) +#install(TARGETS roaring DESTINATION lib) # -install(TARGETS ${ROARING_LIB_NAME} - EXPORT ${ROARING_LIB_NAME}-config +install(TARGETS roaring + EXPORT roaring-config ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCDIR} ) -install(EXPORT ${ROARING_LIB_NAME}-config - FILE ${ROARING_LIB_NAME}-config.cmake - NAMESPACE ${ROARING_LIB_NAME}:: - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${ROARING_LIB_NAME} +install(EXPORT roaring-config + FILE roaring-config.cmake + NAMESPACE roaring:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/roaring ) if(NOT MSVC) ## We output the library at the root of the current directory where cmake is invoked ## This is handy but Visual Studio will happily ignore us -set_target_properties(${ROARING_LIB_NAME} PROPERTIES +set_target_properties(roaring PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} VERSION ${ROARING_LIB_VERSION} SOVERSION ${ROARING_LIB_SOVERSION}) @@ -78,6 +82,6 @@ if(MSVC AND (ROARING_LIB_TYPE STREQUAL "SHARED")) MESSAGE( STATUS "To build a Windows DLL using Visual Studio, you may need cmake 3.4 or better." ) endif() MESSAGE( STATUS "Building a Windows DLL using Visual Studio, exporting all symbols automatically." ) - set_target_properties(${ROARING_LIB_NAME} + set_target_properties(roaring PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS 1) endif() diff --git a/tests/cbitset_unit.c b/tests/cbitset_unit.c index 512854410..d1b3f8edd 100644 --- a/tests/cbitset_unit.c +++ b/tests/cbitset_unit.c @@ -76,10 +76,10 @@ void test_shift_left() { for (size_t k = s1; k < s2; ++k) { bitset_set(b, power * k); } - size_t mycount = bitset_count(b); + int mycount = bitset_count(b); assert_true(compute_cardinality(b) == mycount); bitset_shift_left(b, sh); - assert_true(bitset_count(b) == mycount); + assert_true(bitset_count(b) == (size_t)mycount); assert_true(compute_cardinality(b) == mycount); for (size_t k = s1; k < s2; ++k) { assert_true(bitset_get(b, power * k + sh)); diff --git a/tools/cmake/FindCTargets.cmake b/tools/cmake/FindCTargets.cmake index a65821424..d9330dc83 100644 --- a/tools/cmake/FindCTargets.cmake +++ b/tools/cmake/FindCTargets.cmake @@ -13,7 +13,7 @@ function(add_c_test TEST_NAME) add_executable(${TEST_NAME} ${TEST_NAME}.c) - target_link_libraries(${TEST_NAME} ${ROARING_LIB_NAME} cmocka::cmocka) + target_link_libraries(${TEST_NAME} roaring cmocka::cmocka) add_test(${TEST_NAME} ${TEST_NAME}) endfunction(add_c_test) @@ -29,7 +29,7 @@ if (CMAKE_VERSION VERSION_GREATER 2.8.10) endif() target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/cpp) - target_link_libraries(${TEST_NAME} ${ROARING_LIB_NAME} cmocka::cmocka) + target_link_libraries(${TEST_NAME} roaring cmocka::cmocka) add_test(${TEST_NAME} ${TEST_NAME}) endfunction(add_cpp_test) @@ -41,12 +41,12 @@ endif() function(add_c_benchmark BENCH_NAME) add_executable(${BENCH_NAME} ${BENCH_NAME}.c) - target_link_libraries(${BENCH_NAME} ${ROARING_LIB_NAME}) + target_link_libraries(${BENCH_NAME} roaring) endfunction(add_c_benchmark) function(add_cpp_benchmark BENCH_NAME) add_executable(${BENCH_NAME} ${BENCH_NAME}.cpp) - target_link_libraries(${BENCH_NAME} ${ROARING_LIB_NAME}) + target_link_libraries(${BENCH_NAME} roaring) if(ROARING_EXCEPTIONS) target_compile_definitions(${BENCH_NAME} PUBLIC ROARING_EXCEPTIONS=1) else() From 0fcb1e7170c443ce46f48f94a96c6fd94909083b Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 21:10:27 +0000 Subject: [PATCH 04/10] Tweaks. --- amalgamation.sh | 1 - include/roaring/bitset_util.h | 8 +- include/roaring/isadetection.h | 279 ++-------------------------- include/roaring/misc/configreport.h | 37 +--- microbenchmarks/bench.cpp | 74 ++++++-- microbenchmarks/bench.h | 6 + src/CMakeLists.txt | 1 + src/array_util.c | 13 +- src/bitset_util.c | 12 +- src/containers/array.c | 10 +- src/containers/bitset.c | 56 +++--- src/containers/convert.c | 8 +- src/containers/run.c | 10 +- src/isadetection.c | 268 ++++++++++++++++++++++++++ src/roaring.c | 1 - 15 files changed, 430 insertions(+), 354 deletions(-) create mode 100644 src/isadetection.c diff --git a/amalgamation.sh b/amalgamation.sh index 161582e7a..e35e5bd6c 100755 --- a/amalgamation.sh +++ b/amalgamation.sh @@ -82,7 +82,6 @@ ALL_PRIVATE_C=$( ( \ && ( type git >/dev/null 2>&1 ) \ && ( git ls-files $SCRIPTPATH/src/*.c $SCRIPTPATH/src/**/*c ) \ ) || ( find $SCRIPTPATH/src -name '*.c' ) ) - # Verify up-front that all the files exist # for i in ${ALL_PUBLIC_H} ${ALL_PUBLIC_HH} ${ALL_PRIVATE_H} ${ALL_PRIVATE_C}; do diff --git a/include/roaring/bitset_util.h b/include/roaring/bitset_util.h index 02b5bdd3f..6b5207f96 100644 --- a/include/roaring/bitset_util.h +++ b/include/roaring/bitset_util.h @@ -6,6 +6,12 @@ #include #include +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + #ifdef __cplusplus extern "C" { namespace roaring { namespace internal { #endif @@ -631,7 +637,7 @@ CROARING_UNTARGET_AVX512 __m512i total = _mm512_setzero_si512(); \ const uint64_t limit = size - size % 4; \ uint64_t i = 0; \ - for (; i < limit; i += 4) { \ + for (; i < limit; i += 4) { \ __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \ _mm512_loadu_si512(data2 + i)); \ total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \ diff --git a/include/roaring/isadetection.h b/include/roaring/isadetection.h index 356b4cd9a..27677d22d 100644 --- a/include/roaring/isadetection.h +++ b/include/roaring/isadetection.h @@ -1,56 +1,9 @@ -/* From -https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h -Highly modified. - -Copyright (c) 2016- Facebook, Inc (Adam Paszke) -Copyright (c) 2014- Facebook, Inc (Soumith Chintala) -Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) -Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) -Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) -Copyright (c) 2011-2013 NYU (Clement Farabet) -Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, -Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute -(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, -Samy Bengio, Johnny Mariethoz) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories -America and IDIAP Research Institute nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. -*/ - #ifndef ROARING_ISADETECTION_H #define ROARING_ISADETECTION_H +#if defined(__x86_64__) || defined(_M_AMD64) // x64 + -// isadetection.h does not define any macro (except for ROARING_ISADETECTION_H). -#include -#include -#include #ifndef CROARING_COMPILER_SUPPORTS_AVX512 #ifdef __has_include @@ -69,225 +22,17 @@ POSSIBILITY OF SUCH DAMAGE. #endif #endif -// We need portability.h to be included first, see -// https://github.com/RoaringBitmap/CRoaring/issues/394 -#include -#if CROARING_REGULAR_VISUAL_STUDIO -#include -#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) -#include -#endif // CROARING_REGULAR_VISUAL_STUDIO - - -enum croaring_instruction_set { - CROARING_DEFAULT = 0x0, - CROARING_NEON = 0x1, - CROARING_AVX2 = 0x4, - CROARING_SSE42 = 0x8, - CROARING_PCLMULQDQ = 0x10, - CROARING_BMI1 = 0x20, - CROARING_BMI2 = 0x40, - CROARING_ALTIVEC = 0x80, - CROARING_AVX512F = 0x100, - CROARING_AVX512DQ = 0x200, - CROARING_AVX512BW = 0x400, - CROARING_AVX512VBMI2 = 0x800, - CROARING_AVX512BITALG = 0x1000, - CROARING_AVX512VPOPCNTDQ = 0x2000, - CROARING_UNINITIALIZED = 0x8000 -}; - -#if CROARING_COMPILER_SUPPORTS_AVX512 -static unsigned int CROARING_AVX512_REQUIRED = (CROARING_AVX512F | CROARING_AVX512DQ | CROARING_AVX512BW | CROARING_AVX512VBMI2 | CROARING_AVX512BITALG | CROARING_AVX512VPOPCNTDQ); -#endif - -#if defined(__x86_64__) || defined(_M_AMD64) // x64 - - -static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, - uint32_t *edx) { -#if CROARING_REGULAR_VISUAL_STUDIO - int cpu_info[4]; - __cpuid(cpu_info, *eax); - *eax = cpu_info[0]; - *ebx = cpu_info[1]; - *ecx = cpu_info[2]; - *edx = cpu_info[3]; -#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) - uint32_t level = *eax; - __get_cpuid(level, eax, ebx, ecx, edx); -#else - uint32_t a = *eax, b, c = *ecx, d; - __asm__("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); - *eax = a; - *ebx = b; - *ecx = c; - *edx = d; -#endif -} - -/** - * This is a relatively expensive function but it will get called at most - * *once* per compilation units. Normally, the CRoaring library is built - * as one compilation unit. - */ -static inline uint32_t dynamic_croaring_detect_supported_architectures() { -printf("dynamic_croaring_detect_supported_architectures\n"); - uint32_t eax, ebx, ecx, edx; - uint32_t host_isa = 0x0; - // Can be found on Intel ISA Reference for CPUID - static uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7 - static uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7 - static uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7 - static uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7 - static uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7 - static uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7 - static uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7 - static uint32_t cpuid_avx512bitalg_bit = 1 << 12; ///< @private bit 12 of ECX for EAX=0x7 - static uint32_t cpuid_avx512vpopcntdq_bit = 1 << 14; ///< @private bit 14 of ECX for EAX=0x7 - static uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1 - static uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1 - // ECX for EAX=0x7 - eax = 0x7; - ecx = 0x0; - cpuid(&eax, &ebx, &ecx, &edx); - if (ebx & cpuid_avx2_bit) { - host_isa |= CROARING_AVX2; - } - if (ebx & cpuid_bmi1_bit) { - host_isa |= CROARING_BMI1; - } - - if (ebx & cpuid_bmi2_bit) { - host_isa |= CROARING_BMI2; - } - - if (ebx & cpuid_avx512f_bit) { - host_isa |= CROARING_AVX512F; - } - - if (ebx & cpuid_avx512bw_bit) { - host_isa |= CROARING_AVX512BW; - } - - if (ebx & cpuid_avx512dq_bit) { - host_isa |= CROARING_AVX512DQ; - } - - if (ecx & cpuid_avx512vbmi2_bit) { - host_isa |= CROARING_AVX512VBMI2; - } - - if (ecx & cpuid_avx512bitalg_bit) { - host_isa |= CROARING_AVX512BITALG; - } - - if (ecx & cpuid_avx512vpopcntdq_bit) { - host_isa |= CROARING_AVX512VPOPCNTDQ; - } - - // EBX for EAX=0x1 - eax = 0x1; - cpuid(&eax, &ebx, &ecx, &edx); - - if (ecx & cpuid_sse42_bit) { - host_isa |= CROARING_SSE42; - } - - if (ecx & cpuid_pclmulqdq_bit) { - host_isa |= CROARING_PCLMULQDQ; - } - - return host_isa; -} - -#endif // end SIMD extension detection code - -#if defined(__x86_64__) || defined(_M_AMD64) // x64 - -#if defined(__cplusplus) -static inline uint32_t croaring_detect_supported_architectures() { - // thread-safe as per the C++11 standard. - static uint32_t buffer = dynamic_croaring_detect_supported_architectures(); - return buffer; -} -#elif CROARING_VISUAL_STUDIO -// Visual Studio does not support C11 atomics. -static inline uint32_t croaring_detect_supported_architectures() { - static int buffer = CROARING_UNINITIALIZED; - if (buffer == CROARING_UNINITIALIZED) { - buffer = dynamic_croaring_detect_supported_architectures(); - } - return buffer; -} -#else // CROARING_VISUAL_STUDIO -#include -static inline uint32_t croaring_detect_supported_architectures() { - // we use an atomic for thread safety - static _Atomic uint32_t buffer = CROARING_UNINITIALIZED; - if (buffer == CROARING_UNINITIALIZED) { - // atomicity is sufficient - buffer = dynamic_croaring_detect_supported_architectures(); - } - return buffer; -} -#endif // CROARING_REGULAR_VISUAL_STUDIO - -#ifdef ROARING_DISABLE_AVX -static inline bool croaring_avx2() { - return false; -} -static inline bool croaring_avx512() { - return false; -} -#elif defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__) && defined(__AVX512VPOPCNTDQ__) -static inline bool croaring_avx2() { - return true; -} -static inline bool croaring_avx512() { - return true; -} -#elif defined(__AVX2__) -static inline bool croaring_avx2() { - return true; -} -static inline bool croaring_avx512() { -#if CROARING_COMPILER_SUPPORTS_AVX512 - // Even though we have set __AVX2__ at compile-time, it is still possible for the hardware - // to support AVX-512. By setting __AVX2__, all we are saying is that croaring_avx2() must be true! - static bool avx512_support = false; - - if( !avx512_support ) - { - avx512_support = ( (croaring_detect_supported_architectures() & CROARING_AVX512_REQUIRED) - == CROARING_AVX512_REQUIRED); - } - return avx512_support; -#else - return false; +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { #endif -} -#else -static inline bool croaring_avx2() { - return (croaring_detect_supported_architectures() & CROARING_AVX2) == CROARING_AVX2; -} -static inline bool croaring_avx512() { -#if CROARING_COMPILER_SUPPORTS_AVX512 - static bool avx512_support = false; - - if( !avx512_support ) - { - avx512_support = ( (croaring_detect_supported_architectures() & CROARING_AVX512_REQUIRED) - == CROARING_AVX512_REQUIRED); - } - return avx512_support; -#else - return false; -#endif -} +enum { + ROARING_SUPPORTS_AVX2 = 1, + ROARING_SUPPORTS_AVX512 = 2, +}; +int croaring_hardware_support(); +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { #endif - -#endif // defined(__x86_64__) || defined(_M_AMD64) // x64 - +#endif // x64 #endif // ROARING_ISADETECTION_H diff --git a/include/roaring/misc/configreport.h b/include/roaring/misc/configreport.h index 04f0cea89..87a6aae8a 100644 --- a/include/roaring/misc/configreport.h +++ b/include/roaring/misc/configreport.h @@ -1,7 +1,7 @@ /* * configreport.h * If this gets compiled into a different execution unit than the CRoaring library, - * the functions croaring_avx512() and croaring_avx2() *may* trigger an additional + * the functions croaring_hardware_support() & ROARING_SUPPORTS_AVX512 and croaring_hardware_support() & ROARING_SUPPORTS_AVX2 *may* trigger an additional * call to dynamic_croaring_detect_supported_architectures(). */ #ifndef INCLUDE_MISC_CONFIGREPORT_H_ @@ -172,42 +172,11 @@ static inline void tellmeall() { #ifdef __VERSION__ printf(" compiler version: %s\t", __VERSION__); #endif - uint32_t config = croaring_detect_supported_architectures(); - if((config & CROARING_NEON) == CROARING_NEON) { - printf(" NEON detected\t"); - } + #ifdef __AVX2__ printf(" Building for AVX2\t"); #endif - if(croaring_avx512()) { - printf( "AVX-512\t"); - } - if(croaring_avx2()) { - printf( "AVX2\t"); - } - if((config & CROARING_AVX2) == CROARING_AVX2) { - printf( "AVX2 detected\t"); - if(!croaring_avx2()) { - printf( "AVX2 not used\t"); - } - } -#if CROARING_COMPILER_SUPPORTS_AVX512 - if((config & CROARING_AVX512_REQUIRED) == CROARING_AVX512_REQUIRED) { - printf( "AVX-512 detected\t"); - if(!croaring_avx2()) { - printf( "AVX-512 not used\t"); - } - } -#endif - if((config & CROARING_SSE42) == CROARING_SSE42) { - printf(" SSE4.2 detected\t"); - } - if((config & CROARING_BMI1) == CROARING_BMI1) { - printf(" BMI1 detected\t"); - } - if((config & CROARING_BMI2) == CROARING_BMI2) { - printf(" BMI2 detected\t"); - } + printf("\n"); if ((sizeof(int) != 4) || (sizeof(long) != 8)) { printf("number of bytes: int = %lu long = %lu \n", diff --git a/microbenchmarks/bench.cpp b/microbenchmarks/bench.cpp index 2ea016d2d..1616e5b3d 100644 --- a/microbenchmarks/bench.cpp +++ b/microbenchmarks/bench.cpp @@ -1,17 +1,5 @@ #include "bench.h" -struct compute_cardinality { - static uint64_t run() { - uint64_t marker = 0; - for (size_t i = 0; i < count; ++i) { - marker += roaring_bitmap_get_cardinality(bitmaps[i]); - } - return marker; - } -}; - -auto ComputeCardinality = BasicBench; -BENCHMARK(ComputeCardinality); struct successive_intersection { static uint64_t run() { @@ -28,6 +16,44 @@ struct successive_intersection { auto SuccessiveIntersection = BasicBench; BENCHMARK(SuccessiveIntersection); + +struct successive_intersection_cardinality { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i + 1 < count; ++i) { + marker += roaring_bitmap_and_cardinality(bitmaps[i], bitmaps[i + 1]); + } + return marker; + } +}; +auto SuccessiveIntersectionCardinality = BasicBench; +BENCHMARK(SuccessiveIntersectionCardinality); + + +struct successive_union_cardinality { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i + 1 < count; ++i) { + marker += roaring_bitmap_or_cardinality(bitmaps[i], bitmaps[i + 1]); + } + return marker; + } +}; +auto SuccessiveUnionCardinality = BasicBench; +BENCHMARK(SuccessiveUnionCardinality); + +struct successive_difference_cardinality { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i + 1 < count; ++i) { + marker += roaring_bitmap_andnot_cardinality(bitmaps[i], bitmaps[i + 1]); + } + return marker; + } +}; +auto SuccessiveDifferenceCardinality = BasicBench; +BENCHMARK(SuccessiveDifferenceCardinality); + struct successive_union { static uint64_t run() { uint64_t marker = 0; @@ -114,6 +140,20 @@ struct iterate_all { auto IterateAll = BasicBench; BENCHMARK(IterateAll); + +struct compute_cardinality { + static uint64_t run() { + uint64_t marker = 0; + for (size_t i = 0; i < count; ++i) { + marker += roaring_bitmap_get_cardinality(bitmaps[i]); + } + return marker; + } +}; + +auto ComputeCardinality = BasicBench; +BENCHMARK(ComputeCardinality)->MinTime(2); + int main(int argc, char **argv) { const char *dir_name; if ((argc == 1) || (argc > 1 && argv[1][0] == '-')) { @@ -136,6 +176,16 @@ int main(int argc, char **argv) { "Unsupported system."); } #endif + +#if CROARING_IS_X64 + benchmark::AddCustomContext("x64", "detected"); + int support = roaring::internal::croaring_hardware_support(); +#if CROARING_COMPILER_SUPPORTS_AVX512 + benchmark::AddCustomContext("AVX-512", "supported by compiler"); + benchmark::AddCustomContext("AVX-512 hardware", ( support & roaring::internal::ROARING_SUPPORTS_AVX512 ) ? "yes" : "no"); +#endif // CROARING_COMPILER_SUPPORTS_AVX512 + benchmark::AddCustomContext("AVX-2 hardware", ( support & roaring::internal::ROARING_SUPPORTS_AVX2 ) ? "yes" : "no"); +#endif // CROARING_IS_X64 benchmark::AddCustomContext("data source", dir_name); benchmark::AddCustomContext("number of bitmaps", std::to_string(count)); diff --git a/microbenchmarks/bench.h b/microbenchmarks/bench.h index 755df5811..4f18212f7 100644 --- a/microbenchmarks/bench.h +++ b/microbenchmarks/bench.h @@ -15,6 +15,12 @@ #include "performancecounters/event_counter.h" // clang-format on +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + event_collector collector; size_t N = 1000; size_t bitmap_examples_bytes = 0; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ff7e069d3..8ad9d7835 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,7 @@ endif() MESSAGE( STATUS "ROARING_LIB_TYPE: " ${ROARING_LIB_TYPE}) set(ROARING_SRC + isadetection.c array_util.c bitset_util.c bitset.c diff --git a/src/array_util.c b/src/array_util.c index cadb76821..867f50b0a 100644 --- a/src/array_util.c +++ b/src/array_util.c @@ -9,6 +9,12 @@ #include #include +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + #ifdef __cplusplus extern "C" { namespace roaring { namespace internal { #endif @@ -1953,7 +1959,7 @@ size_t union_uint32_card(const uint32_t *set_1, size_t size_1, size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2, size_t size_2, uint16_t *buffer) { #ifdef CROARING_IS_X64 - if( croaring_avx2() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { // compute union with smallest array first if (size_1 < size_2) { return union_vector16(set_1, (uint32_t)size_1, @@ -2092,12 +2098,13 @@ bool memequals(const void *s1, const void *s2, size_t n) { return true; } #ifdef CROARING_IS_X64 + int support = croaring_hardware_support(); #if CROARING_COMPILER_SUPPORTS_AVX512 - if( croaring_avx512() ) { + if( support & ROARING_SUPPORTS_AVX512 ) { return _avx512_memequals(s1, s2, n); } else #endif // CROARING_COMPILER_SUPPORTS_AVX512 - if( croaring_avx2() ) { + if( support & ROARING_SUPPORTS_AVX2 ) { return _avx2_memequals(s1, s2, n); } else { return memcmp(s1, s2, n) == 0; diff --git a/src/bitset_util.c b/src/bitset_util.c index 331c65620..c249560d1 100644 --- a/src/bitset_util.c +++ b/src/bitset_util.c @@ -6,6 +6,12 @@ #include +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + #ifdef __cplusplus extern "C" { namespace roaring { namespace api { #endif @@ -1006,7 +1012,7 @@ static inline void _scalar_bitset_set_list(uint64_t *words, const uint16_t *list uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list, uint64_t length) { - if( croaring_avx2() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { return _asm_bitset_clear_list(words, card, list, length); } else { return _scalar_bitset_clear_list(words, card, list, length); @@ -1015,7 +1021,7 @@ uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list, uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card, const uint16_t *list, uint64_t length) { - if( croaring_avx2() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { return _asm_bitset_set_list_withcard(words, card, list, length); } else { return _scalar_bitset_set_list_withcard(words, card, list, length); @@ -1023,7 +1029,7 @@ uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card, } void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length) { - if( croaring_avx2() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { _asm_bitset_set_list(words, list, length); } else { _scalar_bitset_set_list(words, list, length); diff --git a/src/containers/array.c b/src/containers/array.c index 8e3c053f2..199c20ceb 100644 --- a/src/containers/array.c +++ b/src/containers/array.c @@ -217,7 +217,7 @@ void array_container_andnot(const array_container_t *array_1, if (out->capacity < array_1->cardinality) array_container_grow(out, array_1->cardinality, false); #ifdef CROARING_IS_X64 - if(( croaring_avx2() ) && (out != array_1) && (out != array_2)) { + if(( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) && (out != array_1) && (out != array_2)) { out->cardinality = difference_vector16(array_1->array, array_1->cardinality, array_2->array, array_2->cardinality, out->array); @@ -248,7 +248,7 @@ void array_container_xor(const array_container_t *array_1, } #ifdef CROARING_IS_X64 - if( croaring_avx2() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { out->cardinality = xor_vector16(array_1->array, array_1->cardinality, array_2->array, array_2->cardinality, out->array); @@ -297,7 +297,7 @@ void array_container_intersection(const array_container_t *array1, array2->array, card_2, array1->array, card_1, out->array); } else { #ifdef CROARING_IS_X64 - if( croaring_avx2() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { out->cardinality = intersect_vector16( array1->array, card_1, array2->array, card_2, out->array); } else { @@ -325,7 +325,7 @@ int array_container_intersection_cardinality(const array_container_t *array1, array1->array, card_1); } else { #ifdef CROARING_IS_X64 - if( croaring_avx2() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { return intersect_vector16_cardinality(array1->array, card_1, array2->array, card_2); } else { @@ -371,7 +371,7 @@ void array_container_intersection_inplace(array_container_t *src_1, src_2->array, card_2, src_1->array, card_1, src_1->array); } else { #ifdef CROARING_IS_X64 - if (croaring_avx2()) { + if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) { src_1->cardinality = intersect_vector16_inplace( src_1->array, card_1, src_2->array, card_2); } else { diff --git a/src/containers/bitset.c b/src/containers/bitset.c index a20741117..86375ea0d 100644 --- a/src/containers/bitset.c +++ b/src/containers/bitset.c @@ -16,6 +16,12 @@ #include #include +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + #ifdef __cplusplus extern "C" { namespace roaring { namespace internal { #endif @@ -56,7 +62,8 @@ bitset_container_t *bitset_container_create(void) { size_t align_size = 32; #ifdef CROARING_IS_X64 - if ( croaring_avx512() ) { + int support = croaring_hardware_support(); + if ( support & ROARING_SUPPORTS_AVX512 ) { // sizeof(__m512i) == 64 align_size = 64; } @@ -131,7 +138,7 @@ bitset_container_t *bitset_container_clone(const bitset_container_t *src) { size_t align_size = 32; #ifdef CROARING_IS_X64 - if ( croaring_avx512() ) { + if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) { // sizeof(__m512i) == 64 align_size = 64; } @@ -257,14 +264,15 @@ static inline int _scalar_bitset_container_compute_cardinality(const bitset_cont } /* Get the number of bits set (force computation) */ int bitset_container_compute_cardinality(const bitset_container_t *bitset) { + int support = croaring_hardware_support(); #if CROARING_COMPILER_SUPPORTS_AVX512 - if( croaring_avx512() ) { + if( support & ROARING_SUPPORTS_AVX512 ) { return (int) avx512_vpopcount( (const __m512i *)bitset->words, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG)); } else #endif // CROARING_COMPILER_SUPPORTS_AVX512 - if( croaring_avx2() ) { + if( support & ROARING_SUPPORTS_AVX2 ) { return (int) avx2_harley_seal_popcount256( (const __m256i *)bitset->words, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG)); @@ -321,10 +329,7 @@ int bitset_container_compute_cardinality(const bitset_container_t *bitset) { #ifndef WORDS_IN_AVX512_REG #define WORDS_IN_AVX512_REG sizeof(__m512i) / sizeof(uint64_t) #endif // WORDS_IN_AVX512_REG -/*#define LOOP_SIZE \ - BITSET_CONTAINER_SIZE_IN_WORDS / \ - ((WORDS_IN_AVX512_REG)*BITSET_CONTAINER_FN_REPEAT) -*/ + /* Computes a binary operation (eg union) on bitset1 and bitset2 and write the result to bitsetout */ // clang-format off @@ -698,15 +703,15 @@ SCALAR_BITSET_CONTAINER_FN(xor, ^, _mm256_xor_si256, veorq_u64) SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64) #if CROARING_COMPILER_SUPPORTS_AVX512 - #define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic) \ int bitset_container_##opname(const bitset_container_t *src_1, \ const bitset_container_t *src_2, \ bitset_container_t *dst) { \ - if ( croaring_avx512() ) { \ + int support = croaring_hardware_support(); \ + if ( support & ROARING_SUPPORTS_AVX512 ) { \ return _avx512_bitset_container_##opname(src_1, src_2, dst); \ } \ - else if ( croaring_avx2() ) { \ + else if ( support & ROARING_SUPPORTS_AVX2 ) { \ return _avx2_bitset_container_##opname(src_1, src_2, dst); \ } else { \ return _scalar_bitset_container_##opname(src_1, src_2, dst); \ @@ -715,10 +720,11 @@ SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64) int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \ const bitset_container_t *src_2, \ bitset_container_t *dst) { \ - if ( croaring_avx512() ) { \ + int support = croaring_hardware_support(); \ + if ( support & ROARING_SUPPORTS_AVX512 ) { \ return _avx512_bitset_container_##opname##_nocard(src_1, src_2, dst); \ } \ - else if ( croaring_avx2() ) { \ + else if ( support & ROARING_SUPPORTS_AVX2 ) { \ return _avx2_bitset_container_##opname##_nocard(src_1, src_2, dst); \ } else { \ return _scalar_bitset_container_##opname##_nocard(src_1, src_2, dst); \ @@ -726,11 +732,11 @@ SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64) } \ int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \ const bitset_container_t *src_2) { \ - if ( croaring_avx512() ) { \ + int support = croaring_hardware_support(); \ + if ( support & ROARING_SUPPORTS_AVX512 ) { \ return _avx512_bitset_container_##opname##_justcard(src_1, src_2); \ } \ - else if ((croaring_detect_supported_architectures() & CROARING_AVX2) == \ - CROARING_AVX2) { \ + else if ( support & ROARING_SUPPORTS_AVX2 ) { \ return _avx2_bitset_container_##opname##_justcard(src_1, src_2); \ } else { \ return _scalar_bitset_container_##opname##_justcard(src_1, src_2); \ @@ -744,7 +750,7 @@ SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64) int bitset_container_##opname(const bitset_container_t *src_1, \ const bitset_container_t *src_2, \ bitset_container_t *dst) { \ - if ( croaring_avx2() ) { \ + if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { \ return _avx2_bitset_container_##opname(src_1, src_2, dst); \ } else { \ return _scalar_bitset_container_##opname(src_1, src_2, dst); \ @@ -753,7 +759,7 @@ SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64) int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \ const bitset_container_t *src_2, \ bitset_container_t *dst) { \ - if ( croaring_avx2() ) { \ + if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { \ return _avx2_bitset_container_##opname##_nocard(src_1, src_2, dst); \ } else { \ return _scalar_bitset_container_##opname##_nocard(src_1, src_2, dst); \ @@ -761,7 +767,7 @@ SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64) } \ int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \ const bitset_container_t *src_2) { \ - if ( croaring_avx2() ) { \ + if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { \ return _avx2_bitset_container_##opname##_justcard(src_1, src_2); \ } else { \ return _scalar_bitset_container_##opname##_justcard(src_1, src_2); \ @@ -893,7 +899,7 @@ int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \ } \ int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \ const bitset_container_t *src_2) { \ - const uint64_t * __restrict__ words_1 = src_1->words; \ + printf("A1\n"); const uint64_t * __restrict__ words_1 = src_1->words; \ const uint64_t * __restrict__ words_2 = src_2->words; \ int32_t sum = 0; \ for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) { \ @@ -927,13 +933,14 @@ int bitset_container_to_uint32_array( uint32_t base ){ #ifdef CROARING_IS_X64 + int support = croaring_hardware_support(); #if CROARING_COMPILER_SUPPORTS_AVX512 - if(( croaring_avx512() ) && (bc->cardinality >= 8192)) // heuristic + if(( support & ROARING_SUPPORTS_AVX512 ) && (bc->cardinality >= 8192)) // heuristic return (int) bitset_extract_setbits_avx512(bc->words, BITSET_CONTAINER_SIZE_IN_WORDS, out, bc->cardinality, base); else #endif - if(( croaring_avx2() ) && (bc->cardinality >= 8192)) // heuristic + if(( support & ROARING_SUPPORTS_AVX2 ) && (bc->cardinality >= 8192)) // heuristic return (int) bitset_extract_setbits_avx2(bc->words, BITSET_CONTAINER_SIZE_IN_WORDS, out, bc->cardinality, base); else @@ -1103,13 +1110,14 @@ bool bitset_container_equals(const bitset_container_t *container1, const bitset_ } } #ifdef CROARING_IS_X64 + int support = croaring_hardware_support(); #if CROARING_COMPILER_SUPPORTS_AVX512 - if( croaring_avx512() ) { + if( support & ROARING_SUPPORTS_AVX512 ) { return _avx512_bitset_container_equals(container1, container2); } else #endif - if( croaring_avx2() ) { + if( support & ROARING_SUPPORTS_AVX2 ) { return _avx2_bitset_container_equals(container1, container2); } #endif diff --git a/src/containers/convert.c b/src/containers/convert.c index ec3b94ceb..743f62184 100644 --- a/src/containers/convert.c +++ b/src/containers/convert.c @@ -5,6 +5,12 @@ #include #include +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + #ifdef __cplusplus extern "C" { namespace roaring { namespace internal { #endif @@ -50,7 +56,7 @@ array_container_t *array_container_from_bitset(const bitset_container_t *bits) { result->cardinality = bits->cardinality; #if CROARING_IS_X64 #if CROARING_COMPILER_SUPPORTS_AVX512 - if( croaring_avx512() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) { bitset_extract_setbits_avx512_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS, result->array, bits->cardinality , 0); } else diff --git a/src/containers/run.c b/src/containers/run.c index a32e476f0..ed3c6c4f8 100644 --- a/src/containers/run.c +++ b/src/containers/run.c @@ -5,6 +5,12 @@ #include #include +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + #ifdef __cplusplus extern "C" { namespace roaring { namespace internal { #endif @@ -924,12 +930,12 @@ static inline int _scalar_run_container_cardinality(const run_container_t *run) int run_container_cardinality(const run_container_t *run) { #if CROARING_COMPILER_SUPPORTS_AVX512 - if( croaring_avx512() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) { return _avx512_run_container_cardinality(run); } else #endif - if( croaring_avx2() ) { + if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) { return _avx2_run_container_cardinality(run); } else { return _scalar_run_container_cardinality(run); diff --git a/src/isadetection.c b/src/isadetection.c new file mode 100644 index 000000000..272577aa0 --- /dev/null +++ b/src/isadetection.c @@ -0,0 +1,268 @@ + +/* From +https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h +Highly modified. + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, +Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute +(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, +Samy Bengio, Johnny Mariethoz) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include + + +// We need portability.h to be included first, see +// https://github.com/RoaringBitmap/CRoaring/issues/394 +#include +#if CROARING_REGULAR_VISUAL_STUDIO +#include +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) +#include +#endif // CROARING_REGULAR_VISUAL_STUDIO +#include + +#if CROARING_IS_X64 +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." +#endif // CROARING_COMPILER_SUPPORTS_AVX512 +#endif + +#ifdef __cplusplus +extern "C" { namespace roaring { namespace internal { +#endif +enum croaring_instruction_set { + CROARING_DEFAULT = 0x0, + CROARING_NEON = 0x1, + CROARING_AVX2 = 0x4, + CROARING_SSE42 = 0x8, + CROARING_PCLMULQDQ = 0x10, + CROARING_BMI1 = 0x20, + CROARING_BMI2 = 0x40, + CROARING_ALTIVEC = 0x80, + CROARING_AVX512F = 0x100, + CROARING_AVX512DQ = 0x200, + CROARING_AVX512BW = 0x400, + CROARING_AVX512VBMI2 = 0x800, + CROARING_AVX512BITALG = 0x1000, + CROARING_AVX512VPOPCNTDQ = 0x2000, + CROARING_UNINITIALIZED = 0x8000 +}; + +#if CROARING_COMPILER_SUPPORTS_AVX512 +static unsigned int CROARING_AVX512_REQUIRED = (CROARING_AVX512F | CROARING_AVX512DQ | CROARING_AVX512BW | CROARING_AVX512VBMI2 | CROARING_AVX512BITALG | CROARING_AVX512VPOPCNTDQ); +#endif + +#if defined(__x86_64__) || defined(_M_AMD64) // x64 + + +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx) { +#if CROARING_REGULAR_VISUAL_STUDIO + int cpu_info[4]; + __cpuid(cpu_info, *eax); + *eax = cpu_info[0]; + *ebx = cpu_info[1]; + *ecx = cpu_info[2]; + *edx = cpu_info[3]; +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid(level, eax, ebx, ecx, edx); +#else + uint32_t a = *eax, b, c = *ecx, d; + __asm__("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; +#endif +} + +/** + * This is a relatively expensive function but it will get called at most + * *once* per compilation units. Normally, the CRoaring library is built + * as one compilation unit. + */ +static inline uint32_t dynamic_croaring_detect_supported_architectures() { + uint32_t eax, ebx, ecx, edx; + uint32_t host_isa = 0x0; + // Can be found on Intel ISA Reference for CPUID + static uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7 + static uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7 + static uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7 + static uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7 + static uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7 + static uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7 + static uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7 + static uint32_t cpuid_avx512bitalg_bit = 1 << 12; ///< @private bit 12 of ECX for EAX=0x7 + static uint32_t cpuid_avx512vpopcntdq_bit = 1 << 14; ///< @private bit 14 of ECX for EAX=0x7 + static uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1 + static uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1 + // ECX for EAX=0x7 + eax = 0x7; + ecx = 0x0; + cpuid(&eax, &ebx, &ecx, &edx); + if (ebx & cpuid_avx2_bit) { + host_isa |= CROARING_AVX2; + } + if (ebx & cpuid_bmi1_bit) { + host_isa |= CROARING_BMI1; + } + + if (ebx & cpuid_bmi2_bit) { + host_isa |= CROARING_BMI2; + } + + if (ebx & cpuid_avx512f_bit) { + host_isa |= CROARING_AVX512F; + } + + if (ebx & cpuid_avx512bw_bit) { + host_isa |= CROARING_AVX512BW; + } + + if (ebx & cpuid_avx512dq_bit) { + host_isa |= CROARING_AVX512DQ; + } + + if (ecx & cpuid_avx512vbmi2_bit) { + host_isa |= CROARING_AVX512VBMI2; + } + + if (ecx & cpuid_avx512bitalg_bit) { + host_isa |= CROARING_AVX512BITALG; + } + + if (ecx & cpuid_avx512vpopcntdq_bit) { + host_isa |= CROARING_AVX512VPOPCNTDQ; + } + + // EBX for EAX=0x1 + eax = 0x1; + cpuid(&eax, &ebx, &ecx, &edx); + + if (ecx & cpuid_sse42_bit) { + host_isa |= CROARING_SSE42; + } + + if (ecx & cpuid_pclmulqdq_bit) { + host_isa |= CROARING_PCLMULQDQ; + } + + return host_isa; +} + +#endif // end SIMD extension detection code + + +#if defined(__x86_64__) || defined(_M_AMD64) // x64 + +#if defined(__cplusplus) +static inline uint32_t croaring_detect_supported_architectures() { + // thread-safe as per the C++11 standard. + static uint32_t buffer = dynamic_croaring_detect_supported_architectures(); + return buffer; +} +#elif CROARING_VISUAL_STUDIO +// Visual Studio does not support C11 atomics. +static inline uint32_t croaring_detect_supported_architectures() { + static int buffer = CROARING_UNINITIALIZED; + if (buffer == CROARING_UNINITIALIZED) { + buffer = dynamic_croaring_detect_supported_architectures(); + } + return buffer; +} +#else // CROARING_VISUAL_STUDIO +#include +uint32_t croaring_detect_supported_architectures() { + // we use an atomic for thread safety + static _Atomic uint32_t buffer = CROARING_UNINITIALIZED; + if (buffer == CROARING_UNINITIALIZED) { + // atomicity is sufficient + buffer = dynamic_croaring_detect_supported_architectures(); + } + return buffer; +} +#endif // CROARING_REGULAR_VISUAL_STUDIO + +#ifdef ROARING_DISABLE_AVX + +int croaring_hardware_support() { + return 0; +} + +#elif defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__) && defined(__AVX512VPOPCNTDQ__) +int croaring_hardware_support() { + return ROARING_SUPPORTS_AVX2 | ROARING_SUPPORTS_AVX512 +} +#elif defined(__AVX2__) + +int croaring_hardware_support() { + static int support = 0xFFFFFFF; + if(support == 0xFFFFFFF) { + bool avx512_support = false; +#if CROARING_COMPILER_SUPPORTS_AVX512 + avx512_support = ( (croaring_detect_supported_architectures() & CROARING_AVX512_REQUIRED) + == CROARING_AVX512_REQUIRED); +#endif + support = ROARING_SUPPORTS_AVX2 | (croaring_has_avx512() ? ROARING_SUPPORTS_AVX512 : 0); + } + return support; +} +#else + +int croaring_hardware_support() { + static int support = 0xFFFFFFF; + if(support == 0xFFFFFFF) { + bool has_avx2 = (croaring_detect_supported_architectures() & CROARING_AVX2) == CROARING_AVX2; + bool has_avx512 = false; +#if CROARING_COMPILER_SUPPORTS_AVX512 + has_avx512 = (croaring_detect_supported_architectures() & CROARING_AVX512_REQUIRED) == CROARING_AVX512_REQUIRED; +#endif // CROARING_COMPILER_SUPPORTS_AVX512 + support = (has_avx2 ? ROARING_SUPPORTS_AVX2 : 0) | (has_avx512 ? ROARING_SUPPORTS_AVX512 : 0); + } + return support; +} +#endif + +#ifdef __cplusplus +} } } // extern "C" { namespace roaring { namespace internal { +#endif +#endif // defined(__x86_64__) || defined(_M_AMD64) // x64 \ No newline at end of file diff --git a/src/roaring.c b/src/roaring.c index 6557ff45a..234356e0f 100644 --- a/src/roaring.c +++ b/src/roaring.c @@ -2790,7 +2790,6 @@ uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *x1, length2 = x2->high_low_container.size; uint64_t answer = 0; int pos1 = 0, pos2 = 0; - while (pos1 < length1 && pos2 < length2) { const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1); const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2); From 9ab9b15de5eeec5359d96ae0d41c079f633d3d11 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 17:18:35 -0400 Subject: [PATCH 05/10] Documentation. --- CMakeLists.txt | 1 - README.md | 34 ++++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 835672c87..123bc2574 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,6 @@ endif() option(ROARING_DISABLE_X64 "Forcefully disable x64 optimizations even if hardware supports it (this disables AVX)" OFF) option(ROARING_DISABLE_AVX "Forcefully disable AVX even if hardware supports it " OFF) option(ROARING_DISABLE_NEON "Forcefully disable NEON even if hardware supports it" OFF) -option(ROARING_DISABLE_NATIVE "Forcefully disable -march optimizations (obsolete)" OFF) option(ROARING_DISABLE_AVX512 "Forcefully disable AVX512 even if compiler supports it" OFF) option(ROARING_BUILD_STATIC "Build a static library" ON) diff --git a/README.md b/README.md index cdf19907f..2d91e0668 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,32 @@ The C interface is found in the file ``include/roaring/roaring.h``. We have C++ Some users have to deal with large volumes of data. It may be important for these users to be aware of the `addMany` (C++) `roaring_bitmap_or_many` (C) functions as it is much faster and economical to add values in batches when possible. Furthermore, calling periodically the `runOptimize` (C++) or `roaring_bitmap_run_optimize` (C) functions may help. + +# Running microbenchmarks + +We have microbenchmarks constructed with the Google Benchmarks. +Under Linux or macOS, you may run them as follows: + +``` +cmake --build build +./build/microbenchmarks/bench +``` + +By default, the benchmark tools picks one data set (e.g., `CRoaring/benchmarks/realdata/census1881`). +We have several data sets and you may pick others: + +``` +./build/microbenchmarks/bench benchmarks/realdata/wikileaks-noquotes +``` + +You may disable some functionality for the purpose of benchmarking. For example, you could +benchmark the code without AVX-512 even if both your processor and compiler supports it: + +``` +cmake --buildnoavx512 -D ROARING_DISABLE_AVX512=OFF +./buildnoavx512/microbenchmarks/bench +``` + # Custom memory allocators For general users, CRoaring would apply default allocator without extra codes. But global memory hook is also provided for those who want a custom memory allocator. Here is an example: ```C @@ -575,14 +601,6 @@ ctest ``` -To run real-data benchmark - -``` -./real_bitmaps_benchmark ../benchmarks/realdata/census1881 -``` -where you must adjust the path "../benchmarks/realdata/census1881" so that it points to one of the directories in the benchmarks/realdata directory. - - To check that your code abides by the style convention (make sure that ``clang-format`` is installed): ``` From 202fb9b45b72de252b9ab385d798cb77d646ae7b Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 21:29:14 +0000 Subject: [PATCH 06/10] Sensible? --- src/array_util.c | 1 + src/bitset_util.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/array_util.c b/src/array_util.c index 867f50b0a..f19b9ac5a 100644 --- a/src/array_util.c +++ b/src/array_util.c @@ -16,6 +16,7 @@ #endif #ifdef __cplusplus +using namespace ::roaring::internal; extern "C" { namespace roaring { namespace internal { #endif diff --git a/src/bitset_util.c b/src/bitset_util.c index c249560d1..427d95901 100644 --- a/src/bitset_util.c +++ b/src/bitset_util.c @@ -13,6 +13,7 @@ #endif #ifdef __cplusplus +using namespace ::roaring::internal; extern "C" { namespace roaring { namespace api { #endif From a00f0b682a7ce36f433f63e8917a306dfba1cc54 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 17:40:59 -0400 Subject: [PATCH 07/10] Minor fix for windows --- microbenchmarks/bench.h | 8 +- microbenchmarks/toni_ronnko_dirent.h | 1075 ++++++++++++++++++++++++++ 2 files changed, 1082 insertions(+), 1 deletion(-) create mode 100644 microbenchmarks/toni_ronnko_dirent.h diff --git a/microbenchmarks/bench.h b/microbenchmarks/bench.h index 4f18212f7..7a8d0662c 100644 --- a/microbenchmarks/bench.h +++ b/microbenchmarks/bench.h @@ -2,12 +2,18 @@ #define CROARING_MICROBENCHMARKS_BENCH_H // clang-format off #include -#include #include #include #include #include +#if (!defined(_WIN32) && !defined(_WIN64) && !(__MINGW32__) && !(__MINGW64__)) +#include +#else +#include "toni_ronnko_dirent.h" +#endif + + #include #include diff --git a/microbenchmarks/toni_ronnko_dirent.h b/microbenchmarks/toni_ronnko_dirent.h new file mode 100644 index 000000000..a9356644f --- /dev/null +++ b/microbenchmarks/toni_ronnko_dirent.h @@ -0,0 +1,1075 @@ +/* + * Dirent interface for Microsoft Visual Studio + * + * Copyright (C) 1998-2019 Toni Ronkko + * This file is part of dirent. Dirent may be freely distributed + * under the MIT license. For all details and documentation, see + * https://github.com/tronkko/dirent + */ +#ifndef DIRENT_H +#define DIRENT_H + +/* Hide warnings about unreferenced local functions */ +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-function" +#elif defined(_MSC_VER) +#pragma warning(disable : 4505) +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +/* + * Include windows.h without Windows Sockets 1.1 to prevent conflicts with + * Windows Sockets 2.0. + */ +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Indicates that d_type field is available in dirent structure */ +#define _DIRENT_HAVE_D_TYPE + +/* Indicates that d_namlen field is available in dirent structure */ +#define _DIRENT_HAVE_D_NAMLEN + +/* Entries missing from MSVC 6.0 */ +#if !defined(FILE_ATTRIBUTE_DEVICE) +#define FILE_ATTRIBUTE_DEVICE 0x40 +#endif + +/* File type and permission flags for stat(), general mask */ +#if !defined(S_IFMT) +#define S_IFMT _S_IFMT +#endif + +/* Directory bit */ +#if !defined(S_IFDIR) +#define S_IFDIR _S_IFDIR +#endif + +/* Character device bit */ +#if !defined(S_IFCHR) +#define S_IFCHR _S_IFCHR +#endif + +/* Pipe bit */ +#if !defined(S_IFFIFO) +#define S_IFFIFO _S_IFFIFO +#endif + +/* Regular file bit */ +#if !defined(S_IFREG) +#define S_IFREG _S_IFREG +#endif + +/* Read permission */ +#if !defined(S_IREAD) +#define S_IREAD _S_IREAD +#endif + +/* Write permission */ +#if !defined(S_IWRITE) +#define S_IWRITE _S_IWRITE +#endif + +/* Execute permission */ +#if !defined(S_IEXEC) +#define S_IEXEC _S_IEXEC +#endif + +/* Pipe */ +#if !defined(S_IFIFO) +#define S_IFIFO _S_IFIFO +#endif + +/* Block device */ +#if !defined(S_IFBLK) +#define S_IFBLK 0 +#endif + +/* Link */ +#if !defined(S_IFLNK) +#define S_IFLNK 0 +#endif + +/* Socket */ +#if !defined(S_IFSOCK) +#define S_IFSOCK 0 +#endif + +/* Read user permission */ +#if !defined(S_IRUSR) +#define S_IRUSR S_IREAD +#endif + +/* Write user permission */ +#if !defined(S_IWUSR) +#define S_IWUSR S_IWRITE +#endif + +/* Execute user permission */ +#if !defined(S_IXUSR) +#define S_IXUSR 0 +#endif + +/* Read group permission */ +#if !defined(S_IRGRP) +#define S_IRGRP 0 +#endif + +/* Write group permission */ +#if !defined(S_IWGRP) +#define S_IWGRP 0 +#endif + +/* Execute group permission */ +#if !defined(S_IXGRP) +#define S_IXGRP 0 +#endif + +/* Read others permission */ +#if !defined(S_IROTH) +#define S_IROTH 0 +#endif + +/* Write others permission */ +#if !defined(S_IWOTH) +#define S_IWOTH 0 +#endif + +/* Execute others permission */ +#if !defined(S_IXOTH) +#define S_IXOTH 0 +#endif + +/* Maximum length of file name */ +#if !defined(PATH_MAX) +#define PATH_MAX MAX_PATH +#endif +#if !defined(FILENAME_MAX) +#define FILENAME_MAX MAX_PATH +#endif +#if !defined(NAME_MAX) +#define NAME_MAX FILENAME_MAX +#endif + +/* File type flags for d_type */ +#define DT_UNKNOWN 0 +#define DT_REG S_IFREG +#define DT_DIR S_IFDIR +#define DT_FIFO S_IFIFO +#define DT_SOCK S_IFSOCK +#define DT_CHR S_IFCHR +#define DT_BLK S_IFBLK +#define DT_LNK S_IFLNK + +/* Macros for converting between st_mode and d_type */ +#define IFTODT(mode) ((mode)&S_IFMT) +#define DTTOIF(type) (type) + +/* + * File type macros. Note that block devices, sockets and links cannot be + * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are + * only defined for compatibility. These macros should always return false + * on Windows. + */ +#if !defined(S_ISFIFO) +#define S_ISFIFO(mode) (((mode)&S_IFMT) == S_IFIFO) +#endif +#if !defined(S_ISDIR) +#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) +#endif +#if !defined(S_ISREG) +#define S_ISREG(mode) (((mode)&S_IFMT) == S_IFREG) +#endif +#if !defined(S_ISLNK) +#define S_ISLNK(mode) (((mode)&S_IFMT) == S_IFLNK) +#endif +#if !defined(S_ISSOCK) +#define S_ISSOCK(mode) (((mode)&S_IFMT) == S_IFSOCK) +#endif +#if !defined(S_ISCHR) +#define S_ISCHR(mode) (((mode)&S_IFMT) == S_IFCHR) +#endif +#if !defined(S_ISBLK) +#define S_ISBLK(mode) (((mode)&S_IFMT) == S_IFBLK) +#endif + +/* Return the exact length of the file name without zero terminator */ +#define _D_EXACT_NAMLEN(p) ((p)->d_namlen) + +/* Return the maximum size of a file name */ +#define _D_ALLOC_NAMLEN(p) ((PATH_MAX) + 1) + +#ifdef __cplusplus +extern "C" { +#endif + +/* Wide-character version */ +struct _wdirent { + /* Always zero */ + long d_ino; + + /* File position within stream */ + long d_off; + + /* Structure size */ + unsigned short d_reclen; + + /* Length of name without \0 */ + size_t d_namlen; + + /* File type */ + int d_type; + + /* File name */ + wchar_t d_name[PATH_MAX + 1]; +}; +typedef struct _wdirent _wdirent; + +struct _WDIR { + /* Current directory entry */ + struct _wdirent ent; + + /* Private file data */ + WIN32_FIND_DATAW data; + + /* True if data is valid */ + int cached; + + /* Win32 search handle */ + HANDLE handle; + + /* Initial directory name */ + wchar_t *patt; +}; +typedef struct _WDIR _WDIR; + +/* Multi-byte character version */ +struct dirent { + /* Always zero */ + long d_ino; + + /* File position within stream */ + long d_off; + + /* Structure size */ + unsigned short d_reclen; + + /* Length of name without \0 */ + size_t d_namlen; + + /* File type */ + int d_type; + + /* File name */ + char d_name[PATH_MAX + 1]; +}; +typedef struct dirent dirent; + +struct DIR { + struct dirent ent; + struct _WDIR *wdirp; +}; +typedef struct DIR DIR; + +/* Dirent functions */ +static DIR *opendir(const char *dirname); +static _WDIR *_wopendir(const wchar_t *dirname); + +static struct dirent *readdir(DIR *dirp); +static struct _wdirent *_wreaddir(_WDIR *dirp); + +static int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result); +static int _wreaddir_r(_WDIR *dirp, struct _wdirent *entry, + struct _wdirent **result); + +static int closedir(DIR *dirp); +static int _wclosedir(_WDIR *dirp); + +static void rewinddir(DIR *dirp); +static void _wrewinddir(_WDIR *dirp); + +static int scandir(const char *dirname, struct dirent ***namelist, + int (*filter)(const struct dirent *), + int (*compare)(const struct dirent **, + const struct dirent **)); + +static int alphasort(const struct dirent **a, const struct dirent **b); + +static int versionsort(const struct dirent **a, const struct dirent **b); + +/* For compatibility with Symbian */ +#define wdirent _wdirent +#define WDIR _WDIR +#define wopendir _wopendir +#define wreaddir _wreaddir +#define wclosedir _wclosedir +#define wrewinddir _wrewinddir + +/* Internal utility functions */ +static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp); +static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp); + +static int dirent_mbstowcs_s(size_t *pReturnValue, wchar_t *wcstr, + size_t sizeInWords, const char *mbstr, + size_t count); + +static int dirent_wcstombs_s(size_t *pReturnValue, char *mbstr, + size_t sizeInBytes, const wchar_t *wcstr, + size_t count); + +static void dirent_set_errno(int error); + +/* + * Open directory stream DIRNAME for read and return a pointer to the + * internal working area that is used to retrieve individual directory + * entries. + */ +static _WDIR *_wopendir(const wchar_t *dirname) { + _WDIR *dirp; +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) + /* Desktop */ + DWORD n; +#else + /* WinRT */ + size_t n; +#endif + wchar_t *p; + + /* Must have directory name */ + if (dirname == NULL || dirname[0] == '\0') { + dirent_set_errno(ENOENT); + return NULL; + } + + /* Allocate new _WDIR structure */ + dirp = (_WDIR *)malloc(sizeof(struct _WDIR)); + if (!dirp) { + return NULL; + } + + /* Reset _WDIR structure */ + dirp->handle = INVALID_HANDLE_VALUE; + dirp->patt = NULL; + dirp->cached = 0; + + /* + * Compute the length of full path plus zero terminator + * + * Note that on WinRT there's no way to convert relative paths + * into absolute paths, so just assume it is an absolute path. + */ +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) + /* Desktop */ + n = GetFullPathNameW(dirname, 0, NULL, NULL); +#else + /* WinRT */ + n = wcslen(dirname); +#endif + + /* Allocate room for absolute directory name and search pattern */ + dirp->patt = (wchar_t *)malloc(sizeof(wchar_t) * n + 16); + if (dirp->patt == NULL) { + goto exit_closedir; + } + + /* + * Convert relative directory name to an absolute one. This + * allows rewinddir() to function correctly even when current + * working directory is changed between opendir() and rewinddir(). + * + * Note that on WinRT there's no way to convert relative paths + * into absolute paths, so just assume it is an absolute path. + */ +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) + /* Desktop */ + n = GetFullPathNameW(dirname, n, dirp->patt, NULL); + if (n <= 0) { + goto exit_closedir; + } +#else + /* WinRT */ + wcsncpy_s(dirp->patt, n + 1, dirname, n); +#endif + + /* Append search pattern \* to the directory name */ + p = dirp->patt + n; + switch (p[-1]) { + case '\\': + case '/': + case ':': + /* Directory ends in path separator, e.g. c:\temp\ */ + /*NOP*/; + break; + + default: + /* Directory name doesn't end in path separator */ + *p++ = '\\'; + } + *p++ = '*'; + *p = '\0'; + + /* Open directory stream and retrieve the first entry */ + if (!dirent_first(dirp)) { + goto exit_closedir; + } + + /* Success */ + return dirp; + + /* Failure */ +exit_closedir: + _wclosedir(dirp); + return NULL; +} + +/* + * Read next directory entry. + * + * Returns pointer to static directory entry which may be overwritten by + * subsequent calls to _wreaddir(). + */ +static struct _wdirent *_wreaddir(_WDIR *dirp) { + struct _wdirent *entry; + + /* + * Read directory entry to buffer. We can safely ignore the return value + * as entry will be set to NULL in case of error. + */ + (void)_wreaddir_r(dirp, &dirp->ent, &entry); + + /* Return pointer to statically allocated directory entry */ + return entry; +} + +/* + * Read next directory entry. + * + * Returns zero on success. If end of directory stream is reached, then sets + * result to NULL and returns zero. + */ +static int _wreaddir_r(_WDIR *dirp, struct _wdirent *entry, + struct _wdirent **result) { + WIN32_FIND_DATAW *datap; + + /* Read next directory entry */ + datap = dirent_next(dirp); + if (datap) { + size_t n; + DWORD attr; + + /* + * Copy file name as wide-character string. If the file name is too + * long to fit in to the destination buffer, then truncate file name + * to PATH_MAX characters and zero-terminate the buffer. + */ + n = 0; + while (n < PATH_MAX && datap->cFileName[n] != 0) { + entry->d_name[n] = datap->cFileName[n]; + n++; + } + entry->d_name[n] = 0; + + /* Length of file name excluding zero terminator */ + entry->d_namlen = n; + + /* File type */ + attr = datap->dwFileAttributes; + if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) { + entry->d_type = DT_CHR; + } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) { + entry->d_type = DT_DIR; + } else { + entry->d_type = DT_REG; + } + + /* Reset dummy fields */ + entry->d_ino = 0; + entry->d_off = 0; + entry->d_reclen = sizeof(struct _wdirent); + + /* Set result address */ + *result = entry; + + } else { + + /* Return NULL to indicate end of directory */ + *result = NULL; + } + + return /*OK*/ 0; +} + +/* + * Close directory stream opened by opendir() function. This invalidates the + * DIR structure as well as any directory entry read previously by + * _wreaddir(). + */ +static int _wclosedir(_WDIR *dirp) { + int ok; + if (dirp) { + + /* Release search handle */ + if (dirp->handle != INVALID_HANDLE_VALUE) { + FindClose(dirp->handle); + } + + /* Release search pattern */ + free(dirp->patt); + + /* Release directory structure */ + free(dirp); + ok = /*success*/ 0; + + } else { + + /* Invalid directory stream */ + dirent_set_errno(EBADF); + ok = /*failure*/ -1; + } + return ok; +} + +/* + * Rewind directory stream such that _wreaddir() returns the very first + * file name again. + */ +static void _wrewinddir(_WDIR *dirp) { + if (dirp) { + /* Release existing search handle */ + if (dirp->handle != INVALID_HANDLE_VALUE) { + FindClose(dirp->handle); + } + + /* Open new search handle */ + dirent_first(dirp); + } +} + +/* Get first directory entry (internal) */ +static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp) { + WIN32_FIND_DATAW *datap; + DWORD error; + + /* Open directory and retrieve the first entry */ + dirp->handle = FindFirstFileExW(dirp->patt, FindExInfoStandard, &dirp->data, + FindExSearchNameMatch, NULL, 0); + if (dirp->handle != INVALID_HANDLE_VALUE) { + + /* a directory entry is now waiting in memory */ + datap = &dirp->data; + dirp->cached = 1; + + } else { + + /* Failed to open directory: no directory entry in memory */ + dirp->cached = 0; + datap = NULL; + + /* Set error code */ + error = GetLastError(); + switch (error) { + case ERROR_ACCESS_DENIED: + /* No read access to directory */ + dirent_set_errno(EACCES); + break; + + case ERROR_DIRECTORY: + /* Directory name is invalid */ + dirent_set_errno(ENOTDIR); + break; + + case ERROR_PATH_NOT_FOUND: + default: + /* Cannot find the file */ + dirent_set_errno(ENOENT); + } + } + return datap; +} + +/* + * Get next directory entry (internal). + * + * Returns + */ +static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp) { + WIN32_FIND_DATAW *p; + + /* Get next directory entry */ + if (dirp->cached != 0) { + + /* A valid directory entry already in memory */ + p = &dirp->data; + dirp->cached = 0; + + } else if (dirp->handle != INVALID_HANDLE_VALUE) { + + /* Get the next directory entry from stream */ + if (FindNextFileW(dirp->handle, &dirp->data) != FALSE) { + /* Got a file */ + p = &dirp->data; + } else { + /* The very last entry has been processed or an error occurred */ + FindClose(dirp->handle); + dirp->handle = INVALID_HANDLE_VALUE; + p = NULL; + } + + } else { + + /* End of directory stream reached */ + p = NULL; + } + + return p; +} + +/* + * Open directory stream using plain old C-string. + */ +static DIR *opendir(const char *dirname) { + struct DIR *dirp; + + /* Must have directory name */ + if (dirname == NULL || dirname[0] == '\0') { + dirent_set_errno(ENOENT); + return NULL; + } + + /* Allocate memory for DIR structure */ + dirp = (DIR *)malloc(sizeof(struct DIR)); + if (!dirp) { + return NULL; + } + { + int error; + wchar_t wname[PATH_MAX + 1]; + size_t n; + + /* Convert directory name to wide-character string */ + error = dirent_mbstowcs_s(&n, wname, PATH_MAX + 1, dirname, PATH_MAX + 1); + if (error) { + /* + * Cannot convert file name to wide-character string. This + * occurs if the string contains invalid multi-byte sequences or + * the output buffer is too small to contain the resulting + * string. + */ + goto exit_free; + } + + /* Open directory stream using wide-character name */ + dirp->wdirp = _wopendir(wname); + if (!dirp->wdirp) { + goto exit_free; + } + } + + /* Success */ + return dirp; + + /* Failure */ +exit_free: + free(dirp); + return NULL; +} + +/* + * Read next directory entry. + */ +static struct dirent *readdir(DIR *dirp) { + struct dirent *entry; + + /* + * Read directory entry to buffer. We can safely ignore the return value + * as entry will be set to NULL in case of error. + */ + (void)readdir_r(dirp, &dirp->ent, &entry); + + /* Return pointer to statically allocated directory entry */ + return entry; +} + +/* + * Read next directory entry into called-allocated buffer. + * + * Returns zero on success. If the end of directory stream is reached, then + * sets result to NULL and returns zero. + */ +static int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result) { + WIN32_FIND_DATAW *datap; + + /* Read next directory entry */ + datap = dirent_next(dirp->wdirp); + if (datap) { + size_t n; + int error; + + /* Attempt to convert file name to multi-byte string */ + error = dirent_wcstombs_s(&n, entry->d_name, PATH_MAX + 1, datap->cFileName, + PATH_MAX + 1); + + /* + * If the file name cannot be represented by a multi-byte string, + * then attempt to use old 8+3 file name. This allows traditional + * Unix-code to access some file names despite of unicode + * characters, although file names may seem unfamiliar to the user. + * + * Be ware that the code below cannot come up with a short file + * name unless the file system provides one. At least + * VirtualBox shared folders fail to do this. + */ + if (error && datap->cAlternateFileName[0] != '\0') { + error = dirent_wcstombs_s(&n, entry->d_name, PATH_MAX + 1, + datap->cAlternateFileName, PATH_MAX + 1); + } + + if (!error) { + DWORD attr; + + /* Length of file name excluding zero terminator */ + entry->d_namlen = n - 1; + + /* File attributes */ + attr = datap->dwFileAttributes; + if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) { + entry->d_type = DT_CHR; + } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) { + entry->d_type = DT_DIR; + } else { + entry->d_type = DT_REG; + } + + /* Reset dummy fields */ + entry->d_ino = 0; + entry->d_off = 0; + entry->d_reclen = sizeof(struct dirent); + + } else { + + /* + * Cannot convert file name to multi-byte string so construct + * an erroneous directory entry and return that. Note that + * we cannot return NULL as that would stop the processing + * of directory entries completely. + */ + entry->d_name[0] = '?'; + entry->d_name[1] = '\0'; + entry->d_namlen = 1; + entry->d_type = DT_UNKNOWN; + entry->d_ino = 0; + entry->d_off = -1; + entry->d_reclen = 0; + } + + /* Return pointer to directory entry */ + *result = entry; + + } else { + + /* No more directory entries */ + *result = NULL; + } + + return /*OK*/ 0; +} + +/* + * Close directory stream. + */ +static int closedir(DIR *dirp) { + int ok; + if (dirp) { + + /* Close wide-character directory stream */ + ok = _wclosedir(dirp->wdirp); + dirp->wdirp = NULL; + + /* Release multi-byte character version */ + free(dirp); + + } else { + + /* Invalid directory stream */ + dirent_set_errno(EBADF); + ok = /*failure*/ -1; + } + return ok; +} + +/* + * Rewind directory stream to beginning. + */ +static void rewinddir(DIR *dirp) { + /* Rewind wide-character string directory stream */ + _wrewinddir(dirp->wdirp); +} + +/* + * Scan directory for entries. + */ +static int scandir(const char *dirname, struct dirent ***namelist, + int (*filter)(const struct dirent *), + int (*compare)(const struct dirent **, + const struct dirent **)) { + struct dirent **files = NULL; + size_t size = 0; + size_t allocated = 0; + const size_t init_size = 1; + DIR *dir = NULL; + struct dirent *entry; + struct dirent *tmp = NULL; + size_t i; + int result = 0; + + /* Open directory stream */ + dir = opendir(dirname); + if (dir) { + + /* Read directory entries to memory */ + while (1) { + + /* Enlarge pointer table to make room for another pointer */ + if (size >= allocated) { + void *p; + size_t num_entries; + + /* Compute number of entries in the enlarged pointer table */ + if (size < init_size) { + /* Allocate initial pointer table */ + num_entries = init_size; + } else { + /* Double the size */ + num_entries = size * 2; + } + + /* Allocate first pointer table or enlarge existing table */ + p = realloc(files, sizeof(void *) * num_entries); + if (p != NULL) { + /* Got the memory */ + files = (dirent **)p; + allocated = num_entries; + } else { + /* Out of memory */ + result = -1; + break; + } + } + + /* Allocate room for temporary directory entry */ + if (tmp == NULL) { + tmp = (struct dirent *)malloc(sizeof(struct dirent)); + if (tmp == NULL) { + /* Cannot allocate temporary directory entry */ + result = -1; + break; + } + } + + /* Read directory entry to temporary area */ + if (readdir_r(dir, tmp, &entry) == /*OK*/ 0) { + + /* Did we get an entry? */ + if (entry != NULL) { + int pass; + + /* Determine whether to include the entry in result */ + if (filter) { + /* Let the filter function decide */ + pass = filter(tmp); + } else { + /* No filter function, include everything */ + pass = 1; + } + + if (pass) { + /* Store the temporary entry to pointer table */ + files[size++] = tmp; + tmp = NULL; + + /* Keep up with the number of files */ + result++; + } + + } else { + + /* + * End of directory stream reached => sort entries and + * exit. + */ + qsort(files, size, sizeof(void *), + (int (*)(const void *, const void *))compare); + break; + } + + } else { + /* Error reading directory entry */ + result = /*Error*/ -1; + break; + } + } + + } else { + /* Cannot open directory */ + result = /*Error*/ -1; + } + + /* Release temporary directory entry */ + free(tmp); + + /* Release allocated memory on error */ + if (result < 0) { + for (i = 0; i < size; i++) { + free(files[i]); + } + free(files); + files = NULL; + } + + /* Close directory stream */ + if (dir) { + closedir(dir); + } + + /* Pass pointer table to caller */ + if (namelist) { + *namelist = files; + } + return result; +} + +/* Alphabetical sorting */ +static int alphasort(const struct dirent **a, const struct dirent **b) { + return strcoll((*a)->d_name, (*b)->d_name); +} + +/* Sort versions */ +static int versionsort(const struct dirent **a, const struct dirent **b) { + /* FIXME: implement strverscmp and use that */ + return alphasort(a, b); +} + +/* Convert multi-byte string to wide character string */ +static int dirent_mbstowcs_s(size_t *pReturnValue, wchar_t *wcstr, + size_t sizeInWords, const char *mbstr, + size_t count) { + int error; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + + /* Microsoft Visual Studio 2005 or later */ + error = mbstowcs_s(pReturnValue, wcstr, sizeInWords, mbstr, count); + +#else + + /* Older Visual Studio or non-Microsoft compiler */ + size_t n; + + /* Convert to wide-character string (or count characters) */ + n = mbstowcs(wcstr, mbstr, sizeInWords); + if (!wcstr || n < count) { + + /* Zero-terminate output buffer */ + if (wcstr && sizeInWords) { + if (n >= sizeInWords) { + n = sizeInWords - 1; + } + wcstr[n] = 0; + } + + /* Length of resulting multi-byte string WITH zero terminator */ + if (pReturnValue) { + *pReturnValue = n + 1; + } + + /* Success */ + error = 0; + + } else { + + /* Could not convert string */ + error = 1; + } + +#endif + return error; +} + +/* Convert wide-character string to multi-byte string */ +static int dirent_wcstombs_s(size_t *pReturnValue, char *mbstr, + size_t sizeInBytes, /* max size of mbstr */ + const wchar_t *wcstr, size_t count) { + int error; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + + /* Microsoft Visual Studio 2005 or later */ + error = wcstombs_s(pReturnValue, mbstr, sizeInBytes, wcstr, count); + +#else + + /* Older Visual Studio or non-Microsoft compiler */ + size_t n; + + /* Convert to multi-byte string (or count the number of bytes needed) */ + n = wcstombs(mbstr, wcstr, sizeInBytes); + if (!mbstr || n < count) { + + /* Zero-terminate output buffer */ + if (mbstr && sizeInBytes) { + if (n >= sizeInBytes) { + n = sizeInBytes - 1; + } + mbstr[n] = '\0'; + } + + /* Length of resulting multi-bytes string WITH zero-terminator */ + if (pReturnValue) { + *pReturnValue = n + 1; + } + + /* Success */ + error = 0; + + } else { + + /* Cannot convert string */ + error = 1; + } + +#endif + return error; +} + +/* Set errno variable */ +static void dirent_set_errno(int error) { +#if defined(_MSC_VER) && _MSC_VER >= 1400 + + /* Microsoft Visual Studio 2005 and later */ + _set_errno(error); + +#else + + /* Non-Microsoft compiler or older Microsoft compiler */ + errno = error; + +#endif +} + +#ifdef __cplusplus +} +#endif +#endif /*DIRENT_H*/ \ No newline at end of file From e28937846b5c417e90e146d710ac74481c3fa32d Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 17:51:14 -0400 Subject: [PATCH 08/10] Better guarding. --- include/roaring/isadetection.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/roaring/isadetection.h b/include/roaring/isadetection.h index 27677d22d..0e0ef0750 100644 --- a/include/roaring/isadetection.h +++ b/include/roaring/isadetection.h @@ -11,16 +11,20 @@ // fully supporting AVX-512. #if __has_include() #define CROARING_COMPILER_SUPPORTS_AVX512 1 -#endif -#endif +#endif // #if __has_include() +#endif // #ifdef __has_include // Visual Studio 2019 and up support AVX-512 #ifdef _MSC_VER #if _MSC_VER >= 1920 #define CROARING_COMPILER_SUPPORTS_AVX512 1 -#endif -#endif -#endif +#endif // #if _MSC_VER >= 1920 +#endif // #ifdef _MSC_VER + +#ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#define CROARING_COMPILER_SUPPORTS_AVX512 0 +#endif // #ifndef CROARING_COMPILER_SUPPORTS_AVX512 +#endif // #ifndef CROARING_COMPILER_SUPPORTS_AVX512 #ifdef __cplusplus From 5807880176541971bf810794f7ea5e24cab772ff Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 18:25:22 -0400 Subject: [PATCH 09/10] Tweaking cmocka --- tests/CMakeLists.txt | 6 ------ tools/cmake/FindCTargets.cmake | 8 ++++---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 875793b22..28177c01a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,9 +40,3 @@ endif() configure_file(${CMAKE_SOURCE_DIR}/tools/cmake/CTestCustom.cmake ${CMAKE_BINARY_DIR}) - -#set(BUILD_STATIC_LIB ON) - -#import_dependency(vendor/cmocka https://cmocka.org/files/1.1/cmocka-1.1.5.tar.xz) -#add_subdirectory(vendor/cmocka) - diff --git a/tools/cmake/FindCTargets.cmake b/tools/cmake/FindCTargets.cmake index d9330dc83..341f9b2c2 100644 --- a/tools/cmake/FindCTargets.cmake +++ b/tools/cmake/FindCTargets.cmake @@ -2,8 +2,8 @@ if (CMAKE_VERSION VERSION_GREATER 3.0.0) cmake_policy(VERSION 3.0.0) endif () include(${PROJECT_SOURCE_DIR}/tools/cmake/Import.cmake) - -import_dependency(cmocka clibs/cmocka ec387ac76d0ce9eece7cb8f523fca79f0e417ac8) +set(BUILD_STATIC_LIB ON) +import_dependency(cmocka clibs/cmocka f5e2cd7) add_dependency(cmocka) function(add_c_test TEST_NAME) @@ -13,7 +13,7 @@ function(add_c_test TEST_NAME) add_executable(${TEST_NAME} ${TEST_NAME}.c) - target_link_libraries(${TEST_NAME} roaring cmocka::cmocka) + target_link_libraries(${TEST_NAME} roaring cmocka-static) add_test(${TEST_NAME} ${TEST_NAME}) endfunction(add_c_test) @@ -29,7 +29,7 @@ if (CMAKE_VERSION VERSION_GREATER 2.8.10) endif() target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/cpp) - target_link_libraries(${TEST_NAME} roaring cmocka::cmocka) + target_link_libraries(${TEST_NAME} roaring cmocka-static) add_test(${TEST_NAME} ${TEST_NAME}) endfunction(add_cpp_test) From 210d01fe050950a7625e91f94aa6253d5024e059 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 1 Apr 2023 18:37:48 -0400 Subject: [PATCH 10/10] Guarding the benchmarks. --- CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 123bc2574..9b41269f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,8 +98,10 @@ configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/tests/config.h.in" add_subdirectory(src) if(ENABLE_ROARING_TESTS) - add_subdirectory(microbenchmarks) - add_subdirectory(benchmarks) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) # we only include the benchmarks on 64-bit systems. + add_subdirectory(microbenchmarks) + add_subdirectory(benchmarks) + endif() add_subdirectory(tests) endif() # Being terse is good, but knowing how the build is configured is important