% Conference paper from the SNTA '24 workshop proceedings.
% `venue` (conference city) and `funding` (grant acknowledgement) are not
% standard BibTeX fields: classic styles ignore them, so they are kept as
% metadata without being printed in the formatted reference.
@inproceedings{10902/34259,
  author    = {Postigo Díaz, Daniel and Herreros Cerro, David and Barón, Eloy and Camarero Coterillo, Cristobal and Fuentes Saez, Pablo},
  title     = {Defining the boundaries for endpoint congestion management in networks for high-performance computing},
  booktitle = {{SNTA} '24: Proceedings of the Seventh International Workshop on Systems and Network Telemetry and Analytics},
  year      = {2024},
  pages     = {15--23},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  venue     = {Pisa},
  url       = {https://hdl.handle.net/10902/34259},
  abstract  = {A hotspot traffic pattern of communications can be a common phenomenon in HPC topologies that causes significant and lasting network performance degradation. This performance deterioration remains persistent over time, intensifying its impact even after the cessation of the detrimental traffic injection into the network. To understand its causes and effects, we analyze the network behavior under different hotspot traffic scenarios and compare the performance on various topologies. We examine both the performance drop due to traffic flows with endpoint contention, and the recovery process of the network after this phenomenon has occurred, if swift action is taken to mitigate it. Our results show that some topologies are more resilient to hotspot traffic than others, both to reduce the performance drop and/or to accelerate the recovery process. In particular, Flattened Butterfly is more resilient to congestion and consistently demonstrates a rapid recovery. The results of the analysis reinforce the need for mechanisms with effective and expeditious action to reduce the magnitude and duration of the performance drop. Furthermore, they highlight behavioral differences between topologies that can affect the effectiveness of mechanisms using congestion-based metrics.},
  funding   = {This work has been supported by Grants PID2019-105660RB-C22, TED2021-131176B-I00 and PID2022-136454NB-C21 funded by MICIU/AEI/10.13039/501100011033 and by ERDF/EU; by the Spanish Ministry of Science and Innovation Ramón y Cajal RYC2021-033959-I, and the European HiPEAC Network of Excellence. The experiments have been executed on the Altamira HPC cluster, at the Institute of Physics of Cantabria (IFCA-CSIC).},
}