add suggestion for quantization (#49)
juncongmoo authored Mar 23, 2023
1 parent 7f9fd6f commit 321d475
Showing 4 changed files with 35 additions and 23 deletions.
18 changes: 14 additions & 4 deletions README.md
@@ -72,7 +72,7 @@ optional arguments:

## 💎 Quantize LLaMA to run on a 4GB GPU

`pyllama` supports 2/3/4/8/16-bit quantization so that you can run the model on a GPU with 4GB of memory.
`pyllama` supports 2/3/4/8-bit quantization so that you can run the model on a GPU with 4GB of memory.

> You need to run `export HUGGING_FACE_HUB_TOKEN=XXX` to be able to access Hugging Face's data. You also need to install [gptq](https://pypi.org/project/gptq/) with the command `pip install gptq`.
@@ -96,7 +96,7 @@ optional arguments:
--nsamples NSAMPLES Number of calibration data samples.
--percdamp PERCDAMP Percent of the average Hessian diagonal to use for dampening.
--nearest Whether to run the RTN baseline.
--wbits {2,3,4,8,16} bits for quantization
--wbits {2,3,4,8} bits for quantization
--groupsize GROUPSIZE
Groupsize to use for quantization; default uses full row.
--save SAVE Save quantized checkpoint under this name, e.g. pyllama-7B4b.pt.
@@ -114,6 +114,12 @@ optional arguments:
python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 8 --save pyllama-7B8b.pt
```

- Quantize 7B model to 4-bit with groupsize 128 (the recommended setup 🔥)

```bash
python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 4 --groupsize 128 --save pyllama-7B4b.pt
```

- Quantize 7B model to 2-bit

```bash
@@ -130,11 +136,15 @@ The download links for quantized LLaMA files are below:
| 3-bit  | -          | - | -                                | - | - |
| 4-bit  | 3779485819 | - | cce9a3b522ddf5c011ee0174b2ff3dfb | - | - |
| 8-bit  | 7017493231 | - | 2648b09597cf8f9e0d1a04cb70b71cab | - | - |
| 16-bit | -          | - | -                                | - | - |
| 32-bit | -          | - | -                                | - | - |


It took me 2 hours and 40 minutes to quantize the 65B model to 4-bit. The file size is reduced from 122GB to 32GB.

> The following suggestions are recommended for LLM quantization:
> 1. By default, use 4-bit quantization for LLM inference, as it offers the best trade-off between total model bits and zero-shot accuracy.
> 2. Use a block size of 128 or lower to stabilize 4-bit quantization and improve zero-shot performance.
> 3. Use a floating-point or quantile quantization data type. In some cases, integer data types might be preferable to improve inference latency, depending on the implementation and hardware support.
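
To make the "block size" recommendation concrete, here is a minimal, hypothetical sketch of block-wise quantization with a block size of 128. It uses a simple symmetric absmax scheme and made-up function names; it is not pyllama's GPTQ implementation.

```python
# Illustrative sketch only: symmetric absmax quantization applied block by block.
# Each block of `blocksize` weights gets its own scale, which is what the
# "block size of 128" recommendation refers to.
import torch

def quantize_blockwise(w: torch.Tensor, wbits: int = 4, blocksize: int = 128):
    """Quantize a 1-D weight tensor block by block with per-block absmax scaling."""
    qmax = 2 ** (wbits - 1) - 1                      # 7 for signed 4-bit
    blocks = w.reshape(-1, blocksize)                # assumes numel is a multiple of blocksize
    scales = blocks.abs().amax(dim=1, keepdim=True).clamp_min(1e-8) / qmax
    q = torch.clamp(torch.round(blocks / scales), -qmax, qmax).to(torch.int8)
    return q, scales

def dequantize_blockwise(q: torch.Tensor, scales: torch.Tensor, shape):
    """Reconstruct an approximation of the original weights."""
    return (q.float() * scales).reshape(shape)

w = torch.randn(4096 * 128)
q, scales = quantize_blockwise(w, wbits=4, blocksize=128)
w_hat = dequantize_blockwise(q, scales, w.shape)
print("max abs error:", (w - w_hat).abs().max().item())
```
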
## 🔮 Single GPU Inference

### 🥥 Without Quantization
4 changes: 2 additions & 2 deletions llama/llama_infer.py
@@ -1,6 +1,6 @@
import torch

from transformers import AutoTokenizer
from llama.hf import LLaMATokenizer
from llama.hf.utils import get_llama
from llama.llama_quant import load_quant

@@ -72,7 +72,7 @@ def run(args=None):
dev = torch.device("cpu")

model.to(dev)
tokenizer = AutoTokenizer.from_pretrained(args.model)
tokenizer = LLaMATokenizer.from_pretrained(args.model)
input_ids = tokenizer.encode(args.text, return_tensors="pt").to(dev)

with torch.no_grad():
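
A brief usage sketch of the tokenizer this file now imports. It assumes pyllama is installed and the `decapoda-research/llama-7b-hf` tokenizer files (the model id used in the README examples) are available; the prompt string is arbitrary.

```python
# Illustrative sketch: the LLaMATokenizer round-trip that llama_infer.py relies on.
from llama.hf import LLaMATokenizer

tokenizer = LLaMATokenizer.from_pretrained("decapoda-research/llama-7b-hf")
input_ids = tokenizer.encode("The meaning of life is", return_tensors="pt")
print(input_ids.shape)                 # torch.Size([1, number_of_tokens])
print(tokenizer.decode(input_ids[0]))  # decode back to text
```
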
32 changes: 17 additions & 15 deletions llama/llama_quant.py
@@ -13,8 +13,7 @@
quantize,
)

from llama.hf.modeling_llama import LLaMAForCausalLM
from llama.hf.configuration_llama import LLaMAConfig
from llama.hf import LLaMAForCausalLM, LLaMATokenizer, LLaMAConfig
from llama.hf.utils import avoid_tensor_modified, get_llama


@@ -67,17 +66,17 @@ def forward(self, inp, **kwargs):
for i in range(len(layers)):
layer = layers[i].to(dev)
subset = find_layers(layer)
gptq = {}
name_to_gptq = {}
for name in subset:
gptq[name] = GPTQ(subset[name])
gptq[name].quantizer = Quantizer()
gptq[name].quantizer.configure(
name_to_gptq[name] = GPTQ(subset[name])
name_to_gptq[name].quantizer = Quantizer()
name_to_gptq[name].quantizer.configure(
args.wbits, perchannel=True, sym=False, mse=False
)

def add_batch(name):
def tmp(_, inp, out):
gptq[name].add_batch(inp[0].data, out.data)
name_to_gptq[name].add_batch(inp[0].data, out.data)

return tmp

@@ -91,15 +90,15 @@ def tmp(_, inp, out):
print(f"\nQuantize layer: {i} ", end=',')
for name in subset:
print(name, end=",")
gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
quantizers["model.layers.%d.%s" % (i, name)] = gptq[name].quantizer
gptq[name].free()
name_to_gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
quantizers["model.layers.%d.%s" % (i, name)] = name_to_gptq[name].quantizer
name_to_gptq[name].free()
for j in range(args.nsamples):
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]

layers[i] = layer.cpu()
del layer
del gptq
del name_to_gptq
torch.cuda.empty_cache()

inps, outs = outs, inps
@@ -215,16 +214,15 @@ def llama_pack(model, quantizers, wbits):
return model


def load_quant(model, checkpoint, wbits, seqlen=1024, for_infer=True):
def load_quant(model_name, checkpoint, wbits, seqlen=1024, for_infer=True):
"""
seqlen - the maximum number of tokens the model processes in a single step. It bounds the memory an input requires: a sequence longer than seqlen can cause out-of-memory errors or slow inference unless it is handled with techniques such as truncation or attention masking, or split into seqlen-sized segments that are processed separately.
"""
import transformers

config = LLaMAConfig.from_pretrained(model)
config = LLaMAConfig.from_pretrained(model_name)
avoid_tensor_modified()

torch.set_default_dtype(torch.half)
transformers.modeling_utils._init_weights = False
torch.set_default_dtype(torch.half)
model = LLaMAForCausalLM(config)
@@ -435,12 +433,16 @@ def run(args=None):
else:
dev = torch.device("cpu")

tokenizer = LLaMATokenizer.from_pretrained(
args.model, add_eos_token=True
)
dataloader, testloader = get_loaders(
args.dataset,
nsamples=args.nsamples,
seed=args.seed,
model=args.model,
seqlen=model.seqlen,
tokenizer=tokenizer
)

if not args.load and args.wbits < 16 and not args.nearest:
@@ -465,7 +467,7 @@
if args.eval:
for dataset in ["wikitext2", "ptb", "c4"]:
dataloader, testloader = get_loaders(
dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, tokenizer=tokenizer
)
print(dataset)
llama_eval(model, testloader, args, dev)
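
The `seqlen` docstring above describes splitting an over-long input into segments; here is a minimal, hypothetical sketch of that chunking (the helper name is invented, not code from pyllama):

```python
# Illustrative only: split a (1, n) tensor of token ids into windows of at most
# `seqlen` tokens, the way an over-long input would be fed to the model piecewise.
import torch

def split_into_segments(input_ids: torch.Tensor, seqlen: int = 1024):
    return [input_ids[:, i : i + seqlen] for i in range(0, input_ids.shape[1], seqlen)]

ids = torch.arange(2500).unsqueeze(0)          # stand-in for 2500 token ids
segments = split_into_segments(ids, seqlen=1024)
print([s.shape[1] for s in segments])          # [1024, 1024, 452]
```
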
4 changes: 2 additions & 2 deletions quant_infer.py
@@ -6,8 +6,8 @@ def main():
driver = hiq.HiQLatency(
hiq_table_or_path=[
["llama.llama_infer", "", "run", "run_quant"],
["llama.llama_infer", "AutoTokenizer", "from_pretrained", "from_pretrained"],
["transformers.models.llama.tokenization_llama", "LLaMATokenizer", "encode", "encode"],
["llama.llama_infer", "LLaMATokenizer", "from_pretrained", "from_pretrained"],
["llama.hf", "LLaMATokenizer", "encode", "encode"],
["llama.llama_infer", "", "load_quant", "load_quant"],
["llama.hf.modeling_llama","LLaMAForCausalLM","generate","generate"]
],
