Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add integration with gemlite weight only quant #2528

Merged
merged 7 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions python/sglang/bench_offline_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,9 @@ def throughput_test(
dataset_path=bench_args.dataset_path,
)

# NOTE(review): removed leftover debug statements that imported os/pwd and
# printed the gemlite cache path to stdout on every benchmark run.  The path
# is computed again where it is actually used (the gemlite cache_config call
# below), so nothing downstream depends on these lines.

# Warm up
if not bench_args.skip_warmup:
logging.info("\nWarmup...")
Expand All @@ -322,6 +325,13 @@ def throughput_test(
)
time.sleep(0.5)

# Best-effort: ask gemlite to persist its autotuned kernel configuration so
# later runs can skip autotuning.  Silently a no-op when gemlite is absent.
# NOTE(review): the filename is built from pw_gecos (the GECOS/full-name
# field, which may be empty or contain spaces) — pw_name looks intended;
# confirm against the matching load_config call before changing, since both
# sides must agree on the path.
try:
    from gemlite.core import GemLiteLinearTriton

    import os
    import pwd

    _gemlite_cfg_path = f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
    GemLiteLinearTriton.cache_config(_gemlite_cfg_path)
except ImportError:
    pass

logging.info("\nBenchmark...")
result = throughput_test_once(
backend_name=bench_args.backend,
Expand Down
8 changes: 8 additions & 0 deletions python/sglang/bench_one_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,14 @@ def latency_test(
8, # shorter decoding to speed up the warmup
server_args.device,
)

# Best-effort persistence of gemlite's autotuned kernel configuration
# (mirrors the equivalent hook in bench_offline_throughput).  Does nothing
# when gemlite is not installed.
# NOTE(review): path uses pw_gecos rather than pw_name — presumably
# unintended, but must stay in sync with the load side; verify before fixing.
try:
    from gemlite.core import GemLiteLinearTriton

    import os
    import pwd

    _gemlite_cfg_path = f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
    GemLiteLinearTriton.cache_config(_gemlite_cfg_path)
except ImportError:
    pass

rank_print("Benchmark ...")

# Run the sweep
Expand Down
26 changes: 26 additions & 0 deletions python/sglang/srt/layers/torchao_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,32 @@ def filter_fn(module, fqn):
256,
], f"int4wo groupsize needs to be one of [32, 64, 128, 256] but got {group_size}"
quantize_(model, int4_weight_only(group_size=group_size), filter_fn=filter_fn)
elif "gemlite" in torchao_config:
# gemlite-<packing_bitwidth>-<bit_width>-<group_size> or
# gemlite-<bit_width>-<group_size> (packing_bitwidth defaults to 32)
import os, pwd
import gemlite
from gemlite.core import GemLiteLinearTriton, set_autotune
from torchao.quantization import gemlite_uintx_weight_only

_quant_args = torchao_config.split("-")
bit_width = int(_quant_args[-2])
group_size = None if _quant_args[-1] == 'None' else int(_quant_args[-1])
try:
packing_bitwidth = int(_quant_args[-3])
except:
# if only 2 inputs found, use default value
packing_bitwidth = 32

quantize_(model, gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth))

# try to load gemlite kernel config
try:
GemLiteLinearTriton.load_config(f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json")
print(f"loaded gemlite kernel cache /tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json")
except:
print(f"unable to load gemlite kernel cache /tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json")

elif "fp8wo" in torchao_config:
# this requires newer hardware
# [rank0]: AssertionError: fp8e4nv data type is not supported on CUDA arch < 89
Expand Down
Loading