From 1005b2bd69e0bee8d9df28b5a57fbef9b4aa6ff1 Mon Sep 17 00:00:00 2001
From: Wallas Henrique <wallashss@users.noreply.github.com>
Date: Wed, 8 Jan 2025 05:35:49 -0300
Subject: [PATCH] [Hardware][Apple] Native support for macOS Apple Silicon
 (#11696)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Signed-off-by: Fred Reiss <frreiss@us.ibm.com>
---
 cmake/cpu_extension.cmake                     | 61 ++++++++++++++-----
 csrc/cpu/cpu_types_arm.hpp                    | 61 ++++++++++++++++++-
 csrc/cpu/utils.cpp                            | 23 +++++--
 .../getting_started/installation/cpu-apple.md | 51 ++++++++++++++++
 .../getting_started/installation/cpu-arm.md   |  4 +-
 .../getting_started/installation/index.md     |  1 +
 requirements-cpu.txt                          |  6 +-
 setup.py                                      |  9 ++-
 vllm/config.py                                | 12 ++++
 vllm/entrypoints/openai/api_server.py         |  3 +
 vllm/utils.py                                 |  7 +++
 11 files changed, 209 insertions(+), 29 deletions(-)
 create mode 100644 docs/source/getting_started/installation/cpu-apple.md

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 68f7ca1af05ad..714abca2a5ff7 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    set(MACOSX_FOUND TRUE)
+endif()
+
+
 #
 # Define environment variables for special configurations
 #
@@ -13,6 +18,9 @@ endif()
 
 include_directories("${CMAKE_SOURCE_DIR}/csrc")
 
+
+set (ENABLE_NUMA TRUE)
+
 #
 # Check the compile flags
 #
@@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
         "-mf16c"
     )
 endif()
-list(APPEND CXX_COMPILE_FLAGS
-    "-fopenmp"
-    "-DVLLM_CPU_EXTENSION")
 
-execute_process(COMMAND cat /proc/cpuinfo
-                RESULT_VARIABLE CPUINFO_RET
-                OUTPUT_VARIABLE CPUINFO)
+if(MACOSX_FOUND)
+    list(APPEND CXX_COMPILE_FLAGS
+        "-Xpreprocessor"
+        "-fopenmp"
+        "-DVLLM_CPU_EXTENSION")
+else()
+    list(APPEND CXX_COMPILE_FLAGS
+        "-fopenmp"
+        "-DVLLM_CPU_EXTENSION")
+endif()
 
-if (NOT CPUINFO_RET EQUAL 0)
-    message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+if (NOT MACOSX_FOUND)
+    execute_process(COMMAND cat /proc/cpuinfo
+                    RESULT_VARIABLE CPUINFO_RET
+                    OUTPUT_VARIABLE CPUINFO)
+    if (NOT CPUINFO_RET EQUAL 0)
+        message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+    endif()
 endif()
 
+
 function (find_isa CPUINFO TARGET OUT)
     string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
     if(NOT ISA_FOUND EQUAL -1)
@@ -54,12 +72,17 @@ endfunction()
 
 is_avx512_disabled(AVX512_DISABLED)
 
-find_isa(${CPUINFO} "avx2" AVX2_FOUND)
-find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
-find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
-find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
-find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
-find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
+if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+    set(APPLE_SILICON_FOUND TRUE)
+else()
+    find_isa(${CPUINFO} "avx2" AVX2_FOUND)
+    find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
+    find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
+    find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
+    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
+    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
+endif()
+
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     list(APPEND CXX_COMPILE_FLAGS
@@ -103,6 +126,9 @@ elseif (ASIMD_FOUND)
         set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")  
     endif()
     list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})     
+elseif(APPLE_SILICON_FOUND)
+    message(STATUS "Apple Silicon Detected")
+    set(ENABLE_NUMA OFF)
 else()
     message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
 endif()
@@ -139,7 +165,12 @@ endif()
 
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
 
-list(APPEND LIBS numa)
+if(ENABLE_NUMA)
+    list(APPEND LIBS numa)
+else()
+    message(STATUS "NUMA is disabled")
+    add_compile_definitions(-DVLLM_NUMA_DISABLED)
+endif()
 
 #
 # _C extension
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
index 73e0f8cb2e0fb..ae062a5b86892 100644
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -91,11 +91,68 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
                 vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
             }
         }
+
+        // Note: below is the unrolled version of the following code:
+        // 
+        // for (int i = 0; i < remainder; ++i) {
+        //     reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = 
+        //          vgetq_lane_f16(temp, i);
+        // }
+        // 
+        // For macOS build (Clang), the arm/neon intrinsics function 
+        // `vgetq_lane_f16` needs the parameter `i` to be constant at compile 
+        // time. 
         
         if (remainder > 0) {
             float16x8_t temp = reg.val[full_blocks];
-            for (int i = 0; i < remainder; ++i) {
-                reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i);
+            __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
+            switch (remainder)
+            {
+            case 1:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              break;
+            case 2:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              break;
+            case 3:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              break;
+            case 4:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              break;
+            case 5:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              break;
+            case 6:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
+              break;
+            case 7:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
+              fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
+              break;
+            
+            default:
+              break;
             }
         }
     }
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp
index 1138a55df2f05..42a1c1d924bac 100644
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -1,10 +1,22 @@
-#include <numa.h>
-#include <unistd.h>
-#include <string>
-#include <sched.h>
+#ifndef VLLM_NUMA_DISABLED
+  #include <numa.h>
+  #include <unistd.h>
+  #include <string>
+  #include <sched.h>
+#endif
 
 #include "cpu_types.hpp"
 
+#ifdef VLLM_NUMA_DISABLED
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
+  return std::string(
+      "Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
+      "no effect to setup thread affinity.");
+}
+
+#endif
+
+#ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
   bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
   TORCH_CHECK(omp_cpu_mask->size > 0);
@@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
   omp_lock_t writelock;
   omp_init_lock(&writelock);
 
-#pragma omp parallel for schedule(static, 1)
+  #pragma omp parallel for schedule(static, 1)
   for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
     cpu_set_t mask;
     CPU_ZERO(&mask);
@@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 
   return ss.str();
 }
+#endif
\ No newline at end of file
diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md
new file mode 100644
index 0000000000000..b55e4384d064d
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu-apple.md
@@ -0,0 +1,51 @@
+(installation-apple)=
+
+# Installation for macOS
+
+vLLM has experimental support for macOS with Apple Silicon. For now, users shall build from the source vLLM to natively run on macOS. For more details, like running on vLLM in a docker container, see [ARM CPU Documentation](installation-arm)
+
+Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
+
+## Requirements
+
+- **Operating System**: `macOS Sonoma` or later
+- **SDK** `XCode 15.4` or later with Command Line Tools
+- **Compilers**: `Apple Clang >= 15.0.0`
+
+<!-- (arm-backend-quick-start-dockerfile)= -->
+
+## Build and installation
+
+After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
+
+```
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ pip install -r requirements-cpu.txt
+$ pip install -e . 
+```
+
+```{note}
+On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
+```
+
+
+
+## Troubleshooting
+
+If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your 
+[Command Line Tools for Xcode](https://developer.apple.com/download/all/).
+
+```
+[...] fatal error: 'map' file not found
+          1 | #include <map>
+            |          ^~~~~
+      1 error generated.
+      [2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o
+
+[...] fatal error: 'cstddef' file not found
+         10 | #include <cstddef>
+            |          ^~~~~~~~~
+      1 error generated.
+```
+
diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md
index a46e2c010600d..e199073ed721f 100644
--- a/docs/source/getting_started/installation/cpu-arm.md
+++ b/docs/source/getting_started/installation/cpu-arm.md
@@ -2,7 +2,7 @@
 
 # Installation for ARM CPUs
 
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
 
 - CPU backend inference capabilities
 - Relevant runtime environment variables
@@ -20,7 +20,7 @@ Contents:
 ## Requirements
 
 - **Operating System**: Linux or macOS
-- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
+- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS
 - **Instruction Set Architecture (ISA)**: NEON support is required
 
 (arm-backend-quick-start-dockerfile)=
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index 83de1aff409b2..0ebadca2ccec9 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -11,6 +11,7 @@ gpu-cuda
 gpu-rocm
 cpu-x86
 cpu-arm
+cpu-apple
 hpu-gaudi
 tpu
 xpu
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index e62f313297762..056fbf5a7adec 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,7 +2,7 @@
 -r requirements-common.txt
 
 # Dependencies for CPUs
-torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" 
-torch==2.5.1; platform_machine == "aarch64"
+torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
+torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" 
 torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
-datasets # for benchmark scripts
\ No newline at end of file
+datasets # for benchmark scripts
diff --git a/setup.py b/setup.py
index ef9f4e579e84d..b6c1f5bc8ac3f 100644
--- a/setup.py
+++ b/setup.py
@@ -34,9 +34,14 @@ def load_module_from_path(module_name, path):
 
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
-if not sys.platform.startswith("linux"):
+if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
     logger.warning(
-        "vLLM only supports Linux platform (including WSL). "
+        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
+    VLLM_TARGET_DEVICE = "cpu"
+elif not (sys.platform.startswith("linux")
+          or sys.platform.startswith("darwin")):
+    logger.warning(
+        "vLLM only supports Linux platform (including WSL) and MacOS."
         "Building on %s, "
         "so vLLM may not be able to run correctly", sys.platform)
     VLLM_TARGET_DEVICE = "empty"
diff --git a/vllm/config.py b/vllm/config.py
index 44426489f686a..535cbe97a311a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4,6 +4,7 @@
 import hashlib
 import json
 import os
+import sys
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
@@ -2259,6 +2260,17 @@ def _get_and_verify_dtype(
                     "supported for POWERPC.")
                 torch_dtype = torch.bfloat16
 
+            # TODO: change this condition to check if the platform support bf16
+            # instead of checking the OS. For instance M2 shall supports bf16
+            # already. But we need to modify `cpu_extension.cmake` to activate
+            # the feature in the build.
+            if (current_platform.is_cpu() and sys.platform.startswith("darwin")
+                    and current_platform.get_cpu_architecture()
+                    == CpuArchEnum.ARM and config_dtype == torch.bfloat16):
+                logger.info("For macOS with Apple Silicon, currently bfloat16 "
+                            "is not supported. Setting dtype to float16.")
+                torch_dtype = torch.float16
+
             if current_platform.is_hpu() and config_dtype == torch.float16:
                 logger.info(
                     "For HPU, we cast models to bfloat16 instead of"
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 4c368b469f252..a91e4792ae0b0 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -7,6 +7,7 @@
 import re
 import signal
 import socket
+import sys
 import tempfile
 import uuid
 from argparse import Namespace
@@ -807,6 +808,8 @@ def signal_handler(*_) -> None:
             ssl_certfile=args.ssl_certfile,
             ssl_ca_certs=args.ssl_ca_certs,
             ssl_cert_reqs=args.ssl_cert_reqs,
+            # Workaround to work on macOS
+            fd=sock.fileno() if sys.platform.startswith("darwin") else None,
             **uvicorn_kwargs,
         )
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 2660b53d7bfb0..c09cae70e9af8 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -524,6 +524,13 @@ def get_open_port() -> int:
 
 
 def find_process_using_port(port: int) -> Optional[psutil.Process]:
+    # TODO: We can not check for running processes with network
+    # port on macOS. Therefore, we can not have a full graceful shutdown
+    # of vLLM. For now, let's not look for processes in this case.
+    # Ref: https://www.florianreinhard.de/accessdenied-in-psutil/
+    if sys.platform.startswith("darwin"):
+        return None
+
     for conn in psutil.net_connections():
         if conn.laddr.port == port:
             try: