Review notes (this text sits outside the entry, so BibTeX ignores it):
field delimiters normalised from quotes to braces; "{Tensor Core}" braced
in the title so sentence-casing styles keep its capitalisation; authors
converted to the unambiguous "Last, First" form; typo "skinner" fixed to
"skinnier" and raw Unicode multiplication sign replaced with $\times$ for
classic 8-bit BibTeX compatibility. The opaque auto-generated citation key
is deliberately kept unchanged so existing \cite references still resolve.
@inproceedings{667be97fc8c74079851108ed870b5fb5,
  author    = {Tang, Hao and Komatsu, Kazuhiko and Sato, Masayuki and Kobayashi, Hiroaki},
  title     = {An Efficient Skinny Matrix-Matrix Multiplication Method by Folding Input Matrices into {Tensor Core} Operations},
  booktitle = {Proceedings - 2020 8th International Symposium on Computing and Networking Workshops, CANDARW 2020},
  series    = {Proceedings - 2020 8th International Symposium on Computing and Networking Workshops, CANDARW 2020},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {164--167},
  year      = {2020},
  month     = nov,
  doi       = {10.1109/CANDARW51189.2020.00041},
  language  = {English},
  keywords  = {GEMM, GPU, Tensor Core, optimization, tall-and-skinny},
  abstract  = {A specialized unit in NVIDIA's GPUs, called Tensor Core, keeps attracting attention in the last couple of years due to its high computing capability for general matrix-matrix multiplications (GEMMs). A Tensor Core unit is capable of calculating a matrix multiply-accumulate (MMA) operation of a specific size. However, if the size of input matrices is skinnier than that of a Tensor Core operation, some computations of a Tensor Core operation become wasted. Thus, this paper presents a method to optimize the calculation of skinny matrix-matrix multiplication that exploits the potential of the Tensor core units. The proposed method feeds multiple segments of an input matrix into a Tensor Core operation to utilize more computations. The experimental results show that the proposed method achieves up to a 2.7$\times$ speedup compared with the cuBLAS 11.0 library.},
  note      = {Publisher Copyright: {\textcopyright} 2020 IEEE.; 8th International Symposium on Computing and Networking Workshops, CANDARW 2020 ; Conference date: 24-11-2020 Through 27-11-2020},
}