@comment{
  KACE paper, SoCC 2024 (conference paper, so the entry type is inproceedings).
  Cleaned up from an automated export:
    - the first author's given name previously used backslash-escaped braces,
      which typeset literal brace characters; it now uses plain grouping braces
    - all authors are in the unambiguous "Last, First" form
    - acronyms in title and booktitle are brace-protected against sentence casing
    - the series field was removed because it only duplicated booktitle
  The citation key is unchanged so existing cite commands keep working.
  The day and language fields are not used by standard styles and are kept as data.
}
@inproceedings{2dc555a9dc344958a75733446d07b31e,
  author    = {Han, {Bing Shiun} and Paul, Tathagata and Liu, Zhenhua and Gandhi, Anshul},
  title     = {{KACE}: Kernel-Aware Colocation for Efficient {GPU} Spatial Sharing},
  booktitle = {{SoCC} 2024 - Proceedings of the 2024 {ACM} Symposium on Cloud Computing},
  publisher = {Association for Computing Machinery, Inc},
  pages     = {460--469},
  year      = {2024},
  month     = nov,
  day       = {20},
  doi       = {10.1145/3698038.3698555},
  language  = {English},
  keywords  = {Cloud Computing, GPU Sharing, Systems for ML},
  note      = {Publisher Copyright: {\textcopyright} 2024 ACM.; 15th Annual ACM Symposium on Cloud Computing, SoCC 2024 ; Conference date: 20-11-2024 Through 22-11-2024},
  abstract  = {GPU spatial sharing among jobs is an effective approach to increase resource utilization and reduce the monetary and environmental costs of running deep learning workloads. While hardware support for GPU spatial sharing already exists, accurately predicting GPU interference between colocated workloads remains a concern. This makes it challenging to improve GPU utilization by sharing the GPU between workloads without severely impacting their performance. Existing approaches to identify and mitigate GPU interference often require extensive profiling and/or hardware modifications, making them difficult to deploy in practice. This paper presents KACE, a lightweight, prediction-based approach to effectively colocate workloads on a given GPU. KACE adequately predicts colocation interference via exclusive kernel metrics using limited training data and minimal training time, eliminating the need for extensive online profiling of each new workload colocation. Experimental results using various training and inference workloads show that KACE outperforms existing rule-based and prediction-based policies by 16\% and 11\%, on average, respectively, and is within 10\% of the performance achieved by an offline-optimal oracle policy.},
}