% Conference paper from the ICPADS 2020 proceedings (see booktitle).
% Inner braces around MapReduce, IEEE and ICPADS protect their capitalisation
% from bibliography styles that re-case titles.
@inproceedings{18359420fdaa4a3588fadcea69367f7b,
  author    = {Ahmad, Zafar and Duppala, Sharmila and Chowdhury, Rezaul and Skiena, Steven},
  title     = {Improved {MapReduce} load balancing through distribution-dependent hash function optimization},
  booktitle = {Proceedings - 2020 {IEEE} 26th International Conference on Parallel and Distributed Systems, {ICPADS} 2020},
  series    = {Proceedings of the International Conference on Parallel and Distributed Systems - {ICPADS}},
  publisher = {IEEE Computer Society},
  year      = {2020},
  month     = dec,
  pages     = {9--18},
  doi       = {10.1109/ICPADS51040.2020.00012},
  keywords  = {Apache Spark, Cloud Computing, Distributed-memory Cluster Computing, Hadoop, Hashing, Load Balancing, MapReduce},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2020 IEEE.; 26th IEEE International Conference on Parallel and Distributed Systems, ICPADS 2020 ; Conference date: 02-12-2020 Through 04-12-2020},
  abstract  = {Load balancing of skewed data in MapReduce systems like Hadoop is a well-studied problem. Many heuristics already exist to improve the load balance of the reducers thereby reducing the overall execution time. In this paper, we propose a lightweight optimization approach for MapReduce systems to minimize the makespan for repetitive tasks involving a typical frequency distribution. Our idea is to analyze the observed frequency distribution for the given task so as to identify an optimal offset parameter c to add in the hash function to minimize makespan. For two different bucketing methods - modulo labeling and consecutive binning - we present efficient algorithms for finding the optimal value of c. Finally, we present simulation results for both bucketing methods. The results vary with the data distribution and the number of reducers, but generally reduce makespan by 20\% on average for power-law distributions. Results are confirmed with experiments on well-known real-world data sets.},
}