AWQ end-to-end example#
This is a simple end-to-end example of the AWQ algorithm. Before running this notebook, make sure that amd-quark has been properly installed, together with the dependencies installed below.
!pip install torch
!pip install transformers==4.52.1
!pip install tqdm
!pip install datasets
!pip install accelerate
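If amd-quark itself is not installed yet, it is published on PyPI; assuming the package name amd-quark used by the Quark documentation, it can be installed the same way:
!pip install amd-quark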
For the AWQ algorithm, Quark provides default configurations for common models. Advanced users who want to use their own AWQ configuration must supply a configuration JSON file. In this notebook, we generate such an AWQ configuration JSON file with a short Python snippet.
import json
# Define the configuration to be written
awq_config = {
"name": "awq",
"scaling_layers": [
{
"prev_op": "input_layernorm",
"layers": ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
"inp": "self_attn.q_proj",
"module2inspect": "self_attn",
},
{"prev_op": "self_attn.v_proj", "layers": ["self_attn.o_proj"], "inp": "self_attn.o_proj"},
{
"prev_op": "post_attention_layernorm",
"layers": ["mlp.gate_proj", "mlp.up_proj"],
"inp": "mlp.gate_proj",
"module2inspect": "mlp",
},
{"prev_op": "mlp.up_proj", "layers": ["mlp.down_proj"], "inp": "mlp.down_proj"},
],
"model_decoder_layers": "model.layers",
}
# Write configuration to a JSON file
with open("custom_awq_config.json", "w") as f:
json.dump(awq_config, f, indent=4)
print("custom_awq_config.json has been created.")
This example uses the AWQ algorithm integrated in Quark to quantize an LLM. It covers quantization, export, and a simple perplexity evaluation.
Import libraries#
First, we import the necessary libraries, including PyTorch, Transformers, Datasets, and the Quark quantization tools.
from typing import Any
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
from quark.torch import LLMTemplate, ModelQuantizer, export_safetensors
from quark.torch.quantization.config.config import load_quant_algo_config_from_file
Data Preparation#
Next, we prepare the calibration dataset and tokenizer. We use the pile-val dataset (mit-han-lab/pile-val-backup) for calibration: the get_pileval function handles dataset loading and tokenization, while get_dataloader wraps the resulting samples in a PyTorch DataLoader.
# -----------------------------
# Dataset / Tokenizer
# -----------------------------
def get_pileval(
tokenizer: PreTrainedTokenizer,
nsamples: int,
seqlen: int,
device: str | None,
seed: int = 0,
) -> torch.Tensor:
dataset: Any = load_dataset("mit-han-lab/pile-val-backup", split="validation").shuffle(seed=seed)
samples, n_run = [], 0
for data in dataset:
line_encoded = tokenizer.encode(data["text"].strip())
if 0 < len(line_encoded) <= seqlen:
samples.append(torch.tensor([line_encoded], device=device))
n_run += 1
if n_run == nsamples:
break
cat_samples = torch.cat(samples, dim=1)
n_split = cat_samples.shape[1] // seqlen
train_dataset = [cat_samples[:, i * seqlen : (i + 1) * seqlen] for i in range(n_split)]
return torch.cat(train_dataset, dim=0)
def get_tokenizer(model_id: str, max_seq_len: int = 512) -> PreTrainedTokenizer:
print(f"Initializing tokenizer from {model_id}")
tokenizer = AutoTokenizer.from_pretrained(
model_id,
model_max_length=max_seq_len,
padding_side="left",
trust_remote_code=True,
use_fast=False,
)
if tokenizer.pad_token != "<unk>":
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
assert tokenizer.pad_token is not None, "Pad token cannot be set!"
return tokenizer
def get_dataloader(
tokenizer: PreTrainedTokenizer,
batch_size: int,
device: str | None,
seq_len: int = 512,
) -> DataLoader:
samples: torch.Tensor = get_pileval(tokenizer, nsamples=128, seqlen=seq_len, device=device, seed=42)
return DataLoader(samples, batch_size=batch_size, shuffle=False, drop_last=True)
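As a quick, optional check of the calibration pipeline, each batch produced by get_dataloader should be a tensor of token ids of shape [batch_size, seq_len]. This is a minimal sketch; the model id and device string are illustrative, and running it will download the pile-val dataset:
# Illustrative check: inspect one calibration batch.
tokenizer = get_tokenizer("Qwen/Qwen2.5-0.5B", max_seq_len=512)
calib_loader = get_dataloader(tokenizer, batch_size=4, device="cpu", seq_len=512)
first_batch = next(iter(calib_loader))
print(first_batch.shape)  # expected: torch.Size([4, 512])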
Model Loading#
Load the pre-trained model from Hugging Face.
# -----------------------------
# Model / Quantization
# -----------------------------
def get_model(model_id: str, device: str | None) -> PreTrainedModel:
model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
model_id,
attn_implementation="eager",
torch_dtype="auto",
)
return model.eval().to(device)
Quantization Pipeline#
This function handles the core AWQ quantization process using Quark. It loads the custom AWQ configuration from the JSON file, builds the quantization config from the model template, initializes the quantizer, runs calibration-based quantization, and exports the quantized model together with the tokenizer.
def quantize_model_pipeline(
model: PreTrainedModel,
calib_dataloader: DataLoader,
tokenizer: PreTrainedTokenizer,
) -> PreTrainedModel:
# Load custom AWQ config
custom_awq_config = load_quant_algo_config_from_file("custom_awq_config.json")
# If you don’t need a custom awq_config, you can omit it and use the default configuration.
template = LLMTemplate(
model_type=model.config.model_type,
exclude_layers_name=["lm_head"],
awq_config=custom_awq_config,
)
quant_config = template.get_config(scheme="uint4_wo_128", algorithm=["awq"])
quantizer = ModelQuantizer(quant_config, multi_device=True)
quantized_model: PreTrainedModel = quantizer.quantize_model(model, calib_dataloader)
print("[INFO] Export Quant Model.")
export_safetensors(model=quantized_model, output_dir="./")
tokenizer.save_pretrained("./")
return quantized_model
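export_safetensors writes the quantized weights and configuration files into output_dir. After the pipeline has run, a plain directory listing (standard library only; the exact file names depend on the Quark release) shows what was produced:
# List exported artifacts; exact file names depend on the Quark version.
import os
print(sorted(name for name in os.listdir("./") if name.endswith((".safetensors", ".json"))))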
Evaluation#
We evaluate the quantized model using perplexity on the WikiText-2 dataset.
# -----------------------------
# Evaluation
# -----------------------------
@torch.no_grad()
def ppl_eval(
model: PreTrainedModel,
tokenizer: PreTrainedTokenizer,
device: str | None,
) -> torch.Tensor:
testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt").input_ids.to(device)
seqlen_for_eval = 2048
nsamples = testenc.numel() // seqlen_for_eval
nlls: list[torch.Tensor] = []
for i in tqdm(range(nsamples)):
batch = testenc[:, i * seqlen_for_eval : (i + 1) * seqlen_for_eval]
lm_logits = model(batch)["logits"]
shift_logits = lm_logits[:, :-1, :].contiguous()
shift_labels = batch[:, 1:]
loss = torch.nn.CrossEntropyLoss()(
shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1),
)
nlls.append(loss.float() * seqlen_for_eval)
ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen_for_eval))
return ppl
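A note on the bookkeeping above: CrossEntropyLoss returns the mean loss per token, so multiplying by seqlen_for_eval recovers the summed negative log-likelihood of one window, and the perplexity is the exponential of the total NLL divided by the total number of scored tokens. A tiny illustration with made-up numbers:
# Made-up per-window mean losses, just to illustrate the arithmetic.
import math
mean_losses = [2.1, 1.9, 2.0]
seqlen = 2048
total_nll = sum(loss * seqlen for loss in mean_losses)
print(math.exp(total_nll / (len(mean_losses) * seqlen)))  # exp(2.0) ~= 7.39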
Run the Example#
Now we combine everything to run the AWQ quantization on Qwen2.5-0.5B.
# -----------------------------
# Pipeline
# -----------------------------
def run_quark_awq_example() -> None:
model_id = "Qwen/Qwen2.5-0.5B"
batch_size, seq_len = 4, 512
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Loading model: {model_id}")
model = get_model(model_id, device)
tokenizer = get_tokenizer(model_id, max_seq_len=seq_len)
calib_dataloader = get_dataloader(tokenizer, batch_size, device, seq_len)
print("[INFO] Starting quantization...")
quantized_model = quantize_model_pipeline(model, calib_dataloader, tokenizer)
print("[INFO] Quantization complete.")
print("[INFO] Simple test PPL with wikitext-2.")
ppl = ppl_eval(quantized_model, tokenizer, device)
print(f"[INFO] Perplexity: {ppl.item():.4f}")
if __name__ == "__main__":
with torch.no_grad():
run_quark_awq_example()
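Beyond perplexity, a quick generation smoke test can confirm that the quantized model still produces coherent text. This is a minimal sketch, not part of the original pipeline: it assumes quantized_model, tokenizer, and device are still in scope (for example, when the cells above are run interactively), and the prompt is arbitrary:
# Hypothetical smoke test: greedy-decode a short continuation.
prompt = "The key benefit of weight-only quantization is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = quantized_model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))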