EADST

PyTorch Q4_0 Quantize and Dequantize Aligned with llama.cpp

PyTorch Q4_0 Quantize and Dequantize Aligned with llama.cpp

import torch

# Check if CUDA is available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


def q4_0_quantize_and_dequantize_tensor(tensor):
    """Round-trip a tensor through llama.cpp-style Q4_0 quantization.

    The tensor is flattened in row-major order and split into blocks of 32
    values. For each block, following llama.cpp's reference Q4_0 routine:

    * ``max`` is the *signed* element with the largest magnitude,
    * ``d = max / -8`` is the block scale (kept as float16 in the format),
    * each value is encoded as ``min(15, trunc(x / d + 8.5))``,
    * and decoded as ``(q - 8) * d``.

    Args:
        tensor: A ``torch.Tensor`` whose number of elements is a multiple
            of 32. Blocks only coincide with llama.cpp's per-row blocks when
            the last dimension is itself a multiple of 32.

    Returns:
        A float16 tensor with the original shape, located on the
        module-level ``device``, holding the dequantized values.

    Raises:
        RuntimeError: If the number of elements is not a multiple of 32.
    """
    tensor = tensor.to(dtype=torch.float32, device=device)
    orig_shape = tensor.shape

    # Nothing to quantize; reshape(-1, 32) would be ambiguous for 0 elements.
    if tensor.numel() == 0:
        return tensor.to(dtype=torch.float16)
    if tensor.numel() % 32 != 0:
        raise RuntimeError(
            f"Q4_0 needs a multiple of 32 elements, got {tensor.numel()} "
            f"(shape {tuple(orig_shape)})"
        )

    # reshape (not view) so non-contiguous inputs are handled too.
    blocks = tensor.reshape(-1, 32)

    # Signed value with the largest magnitude in each block. Using the signed
    # value (not the absolute maximum) lets the extreme element map exactly
    # onto code 0, i.e. onto the -8 end of the asymmetric [-8, 7] range.
    max_idx = torch.argmax(torch.abs(blocks), dim=1, keepdim=True)
    max_vals = torch.gather(blocks, 1, max_idx).squeeze(1)

    # Block scale and its inverse; an all-zero block gets inverse 0, which
    # encodes every element as 8 and therefore decodes to exactly 0.
    d = max_vals / -8.0
    ids = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Scale and quantize: +8.5 then truncation rounds to nearest, the clamp
    # caps the result at the 4-bit maximum of 15.
    scaled = blocks * ids[:, None]
    quantized = torch.clamp(scaled + 8.5, 0, 15).to(torch.uint8)

    # The Q4_0 block stores d as float16, so dequantize with the rounded scale.
    d_stored = d.to(torch.float16).to(torch.float32)
    dequantized = (quantized.float() - 8.0) * d_stored[:, None]

    # Reshape back to the original shape
    return dequantized.reshape(orig_shape).to(dtype=torch.float16)

# Load the checkpoint onto the CPU.
# NOTE: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings identifying the weight tensors that should be quantized.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

# Only existing keys are reassigned, so the dict size never changes while
# iterating over it.
for name, data in model_part.items():
    # any() ensures each tensor is processed once even if several keywords match.
    if any(word in name for word in keywords):
        # Quantize and dequantize the entire tensor, then move the result back
        # to the CPU so GPU memory is released per tensor and the saved
        # checkpoint does not contain CUDA tensors.
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Transformers Freesound git Diagram Tensor BF16 Dataset WAN Heatmap OCR 论文 Linux 强化学习 Bipartite TTS Jetson MD5 Website WebCrawler InvalidArgumentError TensorFlow Knowledge Plate Permission PyTorch ONNX Nginx UNIX Base64 继承 LeetCode JSON tar FastAPI NLP Search 图形思考法 Translation 图标 Pandas ChatGPT GPT4 SVR Numpy Paddle Vim ModelScope BTC 搞笑 Safetensors PDF Zip CAM LaTeX Data Streamlit OpenCV SAM Hotel Random Llama Interview DeepSeek LoRA AI transformers UI TSV Qwen2.5 mmap 报税 Google Disk Pillow CLAP 域名 Tracking Docker SQL PIP Food Breakpoint 论文速读 v2ray Math XGBoost Excel Gemma Card 多进程 Color Template Markdown 证件照 uWSGI GPTQ FlashAttention 关于博主 Ubuntu Windows Pickle Hilton Firewall CSV Baidu Django News FP8 公式 VPN FP16 Vmess Qwen2 Magnet Bitcoin GIT NameSilo Image2Text Miniforge Anaconda diffusers 财报 Domain Proxy printf Datetime HaggingFace Agent QWEN GGML EXCEL Cloudreve 阿里云 BeautifulSoup 飞书 NLTK Tiktoken git-lfs RAR Conda Rebuttal Statistics 算法题 Bert SQLite VGG-16 Input 顶会 LLAMA ResNet-50 OpenAI Logo Sklearn Git LLM 签证 Python Plotly IndexTTS2 logger Video Pytorch Claude 递归学习法 Jupyter TensorRT YOLO Ptyhon CEIR 第一性原理 SPIE RGB Paper Mixtral DeepStream icon Michelin PDB PyCharm XML Shortcut Bin Land tqdm Animate Hungarian Github GoogLeNet FP32 Clash 净利润 Quantization VSCode COCO hf scipy Distillation 版权 Augmentation HuggingFace API 腾讯云 Crawler 多线程 torchinfo Qwen Review llama.cpp v0.dev CUDA C++ Algorithm CC Use uwsgi 云服务器 CTC Password CV Quantize Web FP64 Attention 音频
站点统计

本站现有博文327篇,共被浏览833143

本站已经建立2538天!

热门文章
文章归档
回到顶部