Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Hardware][Apple] Native support for macOS Apple Silicon #11696

Merged
merged 8 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 42 additions & 7 deletions cmake/cpu_extension.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(MACOSX_FOUND TRUE)
endif()


#
# Define environment variables for special configurations
#
Expand All @@ -13,6 +18,9 @@ endif()

include_directories("${CMAKE_SOURCE_DIR}/csrc")


set (ENABLE_NUMA TRUE)

#
# Check the compile flags
#
Expand All @@ -22,13 +30,30 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
"-mf16c"
)
endif()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION")

execute_process(COMMAND cat /proc/cpuinfo
RESULT_VARIABLE CPUINFO_RET
OUTPUT_VARIABLE CPUINFO)

if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-Xpreprocessor"
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
else()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
endif()

if (MACOSX_FOUND)
execute_process(COMMAND uname -m
RESULT_VARIABLE CPUINFO_RET
OUTPUT_VARIABLE CPUINFO)
else()

execute_process(COMMAND cat /proc/cpuinfo
RESULT_VARIABLE CPUINFO_RET
OUTPUT_VARIABLE CPUINFO)
endif()


if (NOT CPUINFO_RET EQUAL 0)
message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
Expand Down Expand Up @@ -60,6 +85,8 @@ find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "arm64" APPLE_SILICON_FOUND)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this sufficiently unique to detect apple silicon?



if (AVX512_FOUND AND NOT AVX512_DISABLED)
list(APPEND CXX_COMPILE_FLAGS
Expand Down Expand Up @@ -103,6 +130,9 @@ elseif (ASIMD_FOUND)
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
elseif(APPLE_SILICON_FOUND)
message(STATUS "Apple Silicon Detected")
set(ENABLE_NUMA OFF)
Comment on lines +129 to +131
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't apple silicon already match ASIMD_FOUND above? This is at least what happens when you build in a docker on apple silicon

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @mgoin, thank you for checking this!

I had second thoughts and I changed this detection on CMake.

The problem that I found is we can not run cat /proc/cpuinfo to get CPU info on Mac OS. So, my first approach was to change the command to use uname -m, trying to keep the same pattern. But using only the built-in variables from CMake we can detect more easy and I think more robust as well.

else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
endif()
Expand Down Expand Up @@ -139,7 +169,12 @@ endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

list(APPEND LIBS numa)
if(ENABLE_NUMA)
list(APPEND LIBS numa)
else()
message("NUMA is disabled")
add_compile_definitions(-DVLLM_NUMA_DISABLED)
endif()

#
# _C extension
Expand Down
61 changes: 59 additions & 2 deletions csrc/cpu/cpu_types_arm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,68 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
}
}

// Note: below is the unrolled version of the following code:
//
// for (int i = 0; i < remainder; ++i) {
// reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] =
// vgetq_lane_f16(temp, i);
// }
//
// For mac os build (Clang), the arm/neon intrinsics function
wallashss marked this conversation as resolved.
Show resolved Hide resolved
// `vgetq_lane_f16` needs the parameter `i` to be constant at compile
// time.

if (remainder > 0) {
float16x8_t temp = reg.val[full_blocks];
for (int i = 0; i < remainder; ++i) {
reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i);
__fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
switch (remainder)
{
case 1:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
break;
case 2:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
break;
case 3:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
break;
case 4:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
break;
case 5:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
break;
case 6:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
break;
case 7:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
break;

default:
break;
}
}
}
Expand Down
23 changes: 18 additions & 5 deletions csrc/cpu/utils.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
#include <numa.h>
#include <unistd.h>
#include <string>
#include <sched.h>
#ifndef VLLM_NUMA_DISABLED
#include <numa.h>
#include <unistd.h>
#include <string>
#include <sched.h>
#endif

#include "cpu_types.hpp"

#ifdef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) {
return std::string(
"Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
"no effect to setup thread affinity.");
}

#endif

#ifndef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask->size > 0);
Expand Down Expand Up @@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
omp_lock_t writelock;
omp_init_lock(&writelock);

#pragma omp parallel for schedule(static, 1)
#pragma omp parallel for schedule(static, 1)
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
cpu_set_t mask;
CPU_ZERO(&mask);
Expand Down Expand Up @@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {

return ss.str();
}
#endif
4 changes: 2 additions & 2 deletions docs/source/getting_started/arm-installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Installation for ARM CPUs

vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (including Apple Silicon). For additional details on supported features, refer to the x86 platform documentation covering:

- CPU backend inference capabilities
- Relevant runtime environment variables
Expand All @@ -20,7 +20,7 @@ Contents:
## Requirements

- **Operating System**: Linux or macOS
- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOs
wallashss marked this conversation as resolved.
Show resolved Hide resolved
- **Instruction Set Architecture (ISA)**: NEON support is required

(arm-backend-quick-start-dockerfile)=
Expand Down
6 changes: 3 additions & 3 deletions requirements-cpu.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
-r requirements-common.txt

# Dependencies for CPUs
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64"
torch==2.5.1; platform_machine == "aarch64"
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin"
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
datasets # for benchmark scripts
datasets # for benchmark scripts
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ def load_module_from_path(module_name, path):

VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE

if not sys.platform.startswith("linux"):
if not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")):
logger.warning(
"vLLM only supports Linux platform (including WSL). "
"vLLM only supports Linux platform (including WSL) and MacOS."
"Building on %s, "
"so vLLM may not be able to run correctly", sys.platform)
VLLM_TARGET_DEVICE = "empty"
Expand Down
Loading