add suggestion for quantization (#49)
juncongmoo authored Mar 23, 2023
1 parent 7f9fd6f commit 321d475
Showing 4 changed files with 35 additions and 23 deletions.
18 changes: 14 additions & 4 deletions README.md
@@ -72,7 +72,7 @@ optional arguments:

## 💎 Quantize LLaMA to run on a 4GB GPU

`pyllama` supports 2/3/4/8/16-bit quantization so that you can run the model on a GPU with 4GB of memory.
`pyllama` supports 2/3/4/8-bit quantization so that you can run the model on a GPU with 4GB of memory.

> You need to run `export HUGGING_FACE_HUB_TOKEN=XXX` to be able to access Hugging Face's data. You also need to install [gptq](https://pypi.org/project/gptq/) with the command `pip install gptq`.
@@ -96,7 +96,7 @@ optional arguments:
--nsamples NSAMPLES Number of calibration data samples.
--percdamp PERCDAMP Percent of the average Hessian diagonal to use for dampening.
--nearest Whether to run the RTN baseline.
--wbits {2,3,4,8,16} bits for quantization
--wbits {2,3,4,8} bits for quantization
--groupsize GROUPSIZE
Groupsize to use for quantization; default uses full row.
--save SAVE Save quantized checkpoint under this name, e.g. pyllama-7B4b.pt.
@@ -114,6 +114,12 @@ optional arguments:
python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 8 --save pyllama-7B8b.pt
```

- Quantize 7B model to 4-bit with groupsize 128 (the recommended setup 🔥)

```bash
python -m llama.llama_quant decapoda-research/llama-7b-hf c4 --wbits 4 --groupsize 128 --save pyllama-7B4b.pt
```

- Quantize 7B model to 2-bit

```bash
@@ -130,11 +136,15 @@ The download links for quantized LLaMA files are below:
| 3-bit  | -          | - | -                                | - | - |
| 4-bit  | 3779485819 | - | cce9a3b522ddf5c011ee0174b2ff3dfb | - | - |
| 8-bit  | 7017493231 | - | 2648b09597cf8f9e0d1a04cb70b71cab | - | - |
| 16-bit | -          | - | -                                | - | - |
| 32-bit | -          | - | -                                | - | - |


It took me 2 hours and 40 minutes to quantize the 65B model to 4-bit. The file size is reduced from 122GB to 32GB.

> The following suggestions are recommended for LLM quantization:
> 1. By default, use 4-bit quantization for LLM inference, as it offers the best trade-off between total model bits and zero-shot accuracy.
> 2. Use a block size of 128 or lower to stabilize 4-bit quantization and improve zero-shot performance.
> 3. Use a floating-point or quantile quantization data type. In some cases, integer data types might be preferable to improve inference latency, depending on the implementation and hardware support.
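
To make the "block size" recommendation concrete, here is a minimal, hypothetical sketch of block-wise quantization with a block size of 128. It uses a simple symmetric absmax scheme and made-up function names; it is not pyllama's GPTQ implementation.

```python
# Illustrative sketch only: symmetric absmax quantization applied block by block.
# Each block of `blocksize` weights gets its own scale, which is what the
# "block size of 128" recommendation refers to.
import torch

def quantize_blockwise(w: torch.Tensor, wbits: int = 4, blocksize: int = 128):
    """Quantize a 1-D weight tensor block by block with per-block absmax scaling."""
    qmax = 2 ** (wbits - 1) - 1                      # 7 for signed 4-bit
    blocks = w.reshape(-1, blocksize)                # assumes numel is a multiple of blocksize
    scales = blocks.abs().amax(dim=1, keepdim=True).clamp_min(1e-8) / qmax
    q = torch.clamp(torch.round(blocks / scales), -qmax, qmax).to(torch.int8)
    return q, scales

def dequantize_blockwise(q: torch.Tensor, scales: torch.Tensor, shape):
    """Reconstruct an approximation of the original weights."""
    return (q.float() * scales).reshape(shape)

w = torch.randn(4096 * 128)
q, scales = quantize_blockwise(w, wbits=4, blocksize=128)
w_hat = dequantize_blockwise(q, scales, w.shape)
print("max abs error:", (w - w_hat).abs().max().item())
```
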
## 🔮 Single GPU Inference

### 🥥 Without Quantization
4 changes: 2 additions & 2 deletions llama/llama_infer.py
@@ -1,6 +1,6 @@
import torch

from transformers import AutoTokenizer
from llama.hf import LLaMATokenizer
from llama.hf.utils import get_llama
from llama.llama_quant import load_quant

@@ -72,7 +72,7 @@ def run(args=None):
dev = torch.device("cpu")

model.to(dev)
tokenizer = AutoTokenizer.from_pretrained(args.model)
tokenizer = LLaMATokenizer.from_pretrained(args.model)
input_ids = tokenizer.encode(args.text, return_tensors="pt").to(dev)

with torch.no_grad():
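
A brief usage sketch of the tokenizer this file now imports. It assumes pyllama is installed and the `decapoda-research/llama-7b-hf` tokenizer files (the model id used in the README examples) are available; the prompt string is arbitrary.

```python
# Illustrative sketch: the LLaMATokenizer round-trip that llama_infer.py relies on.
from llama.hf import LLaMATokenizer

tokenizer = LLaMATokenizer.from_pretrained("decapoda-research/llama-7b-hf")
input_ids = tokenizer.encode("The meaning of life is", return_tensors="pt")
print(input_ids.shape)                 # torch.Size([1, number_of_tokens])
print(tokenizer.decode(input_ids[0]))  # decode back to text
```
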
32 changes: 17 additions & 15 deletions llama/llama_quant.py
@@ -13,8 +13,7 @@
quantize,
)

from llama.hf.modeling_llama import LLaMAForCausalLM
from llama.hf.configuration_llama import LLaMAConfig
from llama.hf import LLaMAForCausalLM, LLaMATokenizer, LLaMAConfig
from llama.hf.utils import avoid_tensor_modified, get_llama


@@ -67,17 +66,17 @@ def forward(self, inp, **kwargs):
for i in range(len(layers)):
layer = layers[i].to(dev)
subset = find_layers(layer)
gptq = {}
name_to_gptq = {}
for name in subset:
gptq[name] = GPTQ(subset[name])
gptq[name].quantizer = Quantizer()
gptq[name].quantizer.configure(
name_to_gptq[name] = GPTQ(subset[name])
name_to_gptq[name].quantizer = Quantizer()
name_to_gptq[name].quantizer.configure(
args.wbits, perchannel=True, sym=False, mse=False
)

def add_batch(name):
def tmp(_, inp, out):
gptq[name].add_batch(inp[0].data, out.data)
name_to_gptq[name].add_batch(inp[0].data, out.data)

return tmp

@@ -91,15 +90,15 @@ def tmp(_, inp, out):
print(f"\nQuantize layer: {i} ", end=',')
for name in subset:
print(name, end=",")
gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
quantizers["model.layers.%d.%s" % (i, name)] = gptq[name].quantizer
gptq[name].free()
name_to_gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
quantizers["model.layers.%d.%s" % (i, name)] = name_to_gptq[name].quantizer
name_to_gptq[name].free()
for j in range(args.nsamples):
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]

layers[i] = layer.cpu()
del layer
del gptq
del name_to_gptq
torch.cuda.empty_cache()

inps, outs = outs, inps
@@ -215,16 +214,15 @@ def llama_pack(model, quantizers, wbits):
return model


def load_quant(model, checkpoint, wbits, seqlen=1024, for_infer=True):
def load_quant(model_name, checkpoint, wbits, seqlen=1024, for_infer=True):
"""
seqlen - the maximum number of tokens the model processes in a single step. It bounds the memory an input requires: a sequence longer than seqlen can cause out-of-memory errors or slow inference unless it is handled with techniques such as truncation or attention masking, or split into seqlen-sized segments that are processed separately.
"""
import transformers

config = LLaMAConfig.from_pretrained(model)
config = LLaMAConfig.from_pretrained(model_name)
avoid_tensor_modified()

torch.set_default_dtype(torch.half)
transformers.modeling_utils._init_weights = False
torch.set_default_dtype(torch.half)
model = LLaMAForCausalLM(config)
@@ -435,12 +433,16 @@ def run(args=None):
else:
dev = torch.device("cpu")

tokenizer = LLaMATokenizer.from_pretrained(
args.model, add_eos_token=True
)
dataloader, testloader = get_loaders(
args.dataset,
nsamples=args.nsamples,
seed=args.seed,
model=args.model,
seqlen=model.seqlen,
tokenizer=tokenizer
)

if not args.load and args.wbits < 16 and not args.nearest:
@@ -465,7 +467,7 @@
if args.eval:
for dataset in ["wikitext2", "ptb", "c4"]:
dataloader, testloader = get_loaders(
dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, tokenizer=tokenizer
)
print(dataset)
llama_eval(model, testloader, args, dev)
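
The `seqlen` docstring above describes splitting an over-long input into segments; here is a minimal, hypothetical sketch of that chunking (the helper name is invented, not code from pyllama):

```python
# Illustrative only: split a (1, n) tensor of token ids into windows of at most
# `seqlen` tokens, the way an over-long input would be fed to the model piecewise.
import torch

def split_into_segments(input_ids: torch.Tensor, seqlen: int = 1024):
    return [input_ids[:, i : i + seqlen] for i in range(0, input_ids.shape[1], seqlen)]

ids = torch.arange(2500).unsqueeze(0)          # stand-in for 2500 token ids
segments = split_into_segments(ids, seqlen=1024)
print([s.shape[1] for s in segments])          # [1024, 1024, 452]
```
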
4 changes: 2 additions & 2 deletions quant_infer.py
@@ -6,8 +6,8 @@ def main():
driver = hiq.HiQLatency(
hiq_table_or_path=[
["llama.llama_infer", "", "run", "run_quant"],
["llama.llama_infer", "AutoTokenizer", "from_pretrained", "from_pretrained"],
["transformers.models.llama.tokenization_llama", "LLaMATokenizer", "encode", "encode"],
["llama.llama_infer", "LLaMATokenizer", "from_pretrained", "from_pretrained"],
["llama.hf", "LLaMATokenizer", "encode", "encode"],
["llama.llama_infer", "", "load_quant", "load_quant"],
["llama.hf.modeling_llama","LLaMAForCausalLM","generate","generate"]
],
