% HotOS 2021 workshop paper on DepFast. No `series` field: the exported value only repeated `booktitle`.
@inproceedings{b3c56cd73e534f62a53d8353fbed745a,
author = "Yoo, Andrew and Wang, Yuanli and Sinha, Ritesh and Mu, Shuai and Xu, Tianyin",
title = "Fail-slow fault tolerance needs programming support",
booktitle = "{HotOS} 2021 - Proceedings of the 2021 Workshop on Hot Topics in Operating Systems",
publisher = "Association for Computing Machinery, Inc",
pages = "228--235",
year = "2021",
month = jun,
day = "1",
doi = "10.1145/3458336.3465299",
language = "English",
keywords = "consensus, distributed systems, fail slow, fault tolerance",
abstract = "The need for fail-slow fault tolerance in modern distributed systems is highlighted by the increasingly reported fail-slow hardware/software components that lead to poor performance system-wide. We argue that fail-slow fault tolerance not only needs new distributed protocol designs, but also desires programming support for implementing and verifying fail-slow fault-tolerant code. Our observation is that the inability of tolerating fail-slow faults in existing distributed systems is often rooted in the implementations and is difficult to understand and debug. We designed the Dependably Fast Library (DepFast) for implementing fail-slow tolerant distributed systems. DepFast provides expressive interfaces for taking control of possible fail-slow points in the program to prevent unexpected slowness propagation once and for all. We use DepFast to implement a distributed replicated state machine (RSM) and show that it can tolerate various types of fail-slow faults that affect existing RSM implementations.",
note = "Publisher Copyright: {\textcopyright} 2021 Owner/Author.; 18th Workshop on Hot Topics in Operating Systems, HotOS 2021 ; Conference date: 01-06-2021 Through 03-06-2021",
}