@inproceedings{d9ff098c7ad442d7b76e8b4b72e5ae85,
title = "Shedding the Bits: Pushing the Boundaries of Quantization with Minifloats on FPGAs",
abstract = "Post-training quantization (PTQ) is a powerful technique for model compression, reducing the numerical precision in neural networks without additional training overhead. Recent works have investigated adopting 8-bit floating-point formats (FP8) in the context of PTQ for model inference. However, floating-point formats smaller than 8 bits, and how they compare with integer formats in terms of accuracy and hardware cost, remain unexplored on FPGAs. In this work, we present minifloats, which are reduced-precision floating-point formats capable of further reducing the memory footprint, latency, and energy cost of a model while approaching full-precision model accuracy. We implement a custom FPGA-based multiply-accumulate operator library and explore the vast design space, comparing minifloat and integer representations across 3 to 8 bits for both weights and activations. We also examine the applicability of various integer-based quantization techniques to minifloats. Our experiments show that minifloats offer a promising alternative for emerging workloads such as vision transformers.",
keywords = "minifloats, multiply-accumulate, quantization",
author = "Aggarwal, Shivam and Damsgaard, {Hans Jakob} and Pappalardo, Alessandro and Franco, Giuseppe and Preu{\ss}er, {Thomas B.} and Blott, Michaela and Mitra, Tulika",
note = "34th International Conference on Field-Programmable Logic and Applications, FPL 2024; Conference date: 02-09-2024 through 06-09-2024",
year = "2024",
doi = "10.1109/FPL64840.2024.00048",
language = "English",
publisher = "IEEE",
pages = "297--303",
booktitle = "Proceedings - 2024 34th International Conference on Field-Programmable Logic and Applications, FPL 2024",
address = "United States",
}
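
To make the minifloat idea in the abstract concrete, below is a minimal, illustrative Python sketch of rounding values to a reduced-precision floating-point grid with configurable exponent and mantissa widths. It is not the paper's operator library or its FPGA implementation; the function name `minifloat_quantize` and the parameters `exp_bits`, `man_bits`, and `bias` are assumptions for illustration, it saturates to the largest representable magnitude, and it reserves no encodings for infinities or NaNs (unlike, e.g., standard FP8 E4M3 variants).

```python
import numpy as np

def minifloat_quantize(x, exp_bits=3, man_bits=2, bias=None):
    """Round x to the nearest value representable in a sign + exp_bits +
    man_bits minifloat format (subnormals included, no special values).
    Illustrative sketch only, not the paper's implementation."""
    if bias is None:
        bias = 2 ** (exp_bits - 1) - 1  # IEEE-like default exponent bias
    x = np.asarray(x, dtype=np.float64)
    sign = np.sign(x)
    mag = np.abs(x)

    # Largest representable magnitude: maximum exponent with a full mantissa.
    max_exp = (2 ** exp_bits - 1) - bias
    max_val = 2.0 ** max_exp * (2.0 - 2.0 ** -man_bits)
    mag = np.minimum(mag, max_val)  # saturate instead of overflowing

    # Per-value exponent, clamped to the subnormal range at the bottom.
    exp = np.floor(np.log2(np.maximum(mag, np.finfo(float).tiny)))
    exp = np.clip(exp, 1 - bias, max_exp)

    # Quantization step within the binade, then round to nearest (ties to even).
    scale = 2.0 ** (exp - man_bits)
    return sign * np.round(mag / scale) * scale

# Example: fake-quantize a small tensor to a 6-bit minifloat (1 sign, 3 exp, 2 man).
print(minifloat_quantize(np.array([0.15, -0.3, 5.7, 100.0]), exp_bits=3, man_bits=2))
```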