{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:00:32Z","timestamp":1776931232088,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"name":"Plasma-PEPSC","award":["101093261"],"award-info":[{"award-number":["101093261"]}]},{"name":"SEANERGYS","award":["101177590"],"award-info":[{"award-number":["101177590"]}]},{"name":"PDExa","award":["16ME0641"],"award-info":[{"award-number":["16ME0641"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767518","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:18:44Z","timestamp":1762532324000},"page":"1574-1586","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MT4G: A Tool for Reliable Auto-Discovery of NVIDIA and AMD GPU Compute and Memory Topologies"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4120-9472","authenticated-orcid":false,"given":"Stepan","family":"Vanecek","sequence":"first","affiliation":[{"name":"School of Computation, Information and Technology, Technical University of Munich, Garching, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9776-6994","authenticated-orcid":false,"given":"Manuel Walter","family":"Mu\u00dfbacher","sequence":"additional","affiliation":[{"name":"School of Computation, Information and Technology, Technical University of Munich, Garching, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7923-7466","authenticated-orcid":false,"given":"Dominik","family":"Gr\u00f6\u00dfler","sequence":"additional","affiliation":[{"name":"School of Computation, Information and Technology, Technical University of Munich, Garching, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0953-5048","authenticated-orcid":false,"given":"Urvij","family":"Saroliya","sequence":"additional","affiliation":[{"name":"School of Computation, Information and Technology, Technical University of Munich, Garching, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9013-435X","authenticated-orcid":false,"given":"Martin","family":"Schulz","sequence":"additional","affiliation":[{"name":"School of Computation, Information and Technology, Technical University of Munich, Garching, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","volume-title":"NVIDIA Multi-Instance GPU and NVIDIA Virtual Compute Server","year":"2020","unstructured":"2020. NVIDIA Multi-Instance GPU and NVIDIA Virtual Compute Server. Technical Report. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/design-visualization\/solutions\/resources\/documents1\/Technical-Brief-Multi-Instance-GPU-NVIDIA-Virtual-Compute-Server.pdf"},{"key":"e_1_3_3_2_3_2","volume-title":"Introducing AMD CDNA2 Architecture","author":"Inc. Advanced Micro Devices,","year":"2021","unstructured":"Advanced Micro Devices, Inc.2021. Introducing AMD CDNA2 Architecture. Advanced Micro Devices, Inc. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-business-docs\/white-papers\/amd-cdna2-white-paper.pdf"},{"key":"e_1_3_3_2_4_2","volume-title":"AMD Instinct MI200 Instruction Set Architecture Reference Guide","author":"Inc. Advanced Micro Devices,","year":"2022","unstructured":"Advanced Micro Devices, Inc.2022. AMD Instinct MI200 Instruction Set Architecture Reference Guide. Advanced Micro Devices, Inc. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/instruction-set-architectures\/instinct-mi200-cdna2-instruction-set-architecture.pdf"},{"key":"e_1_3_3_2_5_2","volume-title":"HIP documentation","author":"Inc Advanced Micro Devices,","year":"2025","unstructured":"Advanced Micro Devices, Inc. 2025. HIP documentation. AMD ROCm. https:\/\/rocm.docs.amd.com\/projects\/HIP\/en\/docs-develop\/index.html"},{"key":"e_1_3_3_2_6_2","unstructured":"Advanced Micro Devices Inc.2025. HIP Runtime API: hipDeviceProp_t Struct Reference. https:\/\/rocm.docs.amd.com\/projects\/HIP\/en\/latest\/doxygen\/html\/structhip_device_prop__t.html"},{"key":"e_1_3_3_2_7_2","unstructured":"Advanced Micro Devices Inc.2025. ROCm System Management Interface (ROCm SMI) library. https:\/\/rocm.docs.amd.com\/projects\/rocm_smi_lib\/en\/latest\/"},{"key":"e_1_3_3_2_8_2","unstructured":"Advanced Micro Devices Inc.2025. ROCProfiler documentation. https:\/\/rocm.docs.amd.com\/projects\/rocprofiler\/en\/latest\/index.html"},{"key":"e_1_3_3_2_9_2","volume-title":"ROCProfiler documentation - ROCProfiler 2.0.0 Documentation","author":"Inc. Advanced Micro Devices,","year":"2025","unstructured":"Advanced Micro Devices, Inc.2025. ROCProfiler documentation - ROCProfiler 2.0.0 Documentation. Advanced Micro Devices, Inc. https:\/\/rocm.docs.amd.com\/projects\/rocprofiler\/en\/latest\/ Accessed: 2025-08-01."},{"key":"e_1_3_3_2_10_2","unstructured":"Paul Alcorn. 2024. AMD announces unified UDNA GPU architecture \u2014 bringing RDNA and CDNA together to take on Nvidia\u2019s CUDA ecosystem. Tom\u2019s Hardware (9 Sept. 2024). https:\/\/www.tomshardware.com\/pc-components\/cpus\/amd-announces-unified-udna-gpu-architecture-bringing-rdna-and-cdna-together-to-take-on-nvidias-cuda-ecosystem"},{"key":"e_1_3_3_2_11_2","unstructured":"AMD. 2024. AMD CDNA 3 Architecture Whitepaper. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-tech-docs\/white-papers\/amd-cdna-3-white-paper.pdf"},{"key":"e_1_3_3_2_12_2","volume-title":"Accelerator and GPU Hardware Specifications","author":"Team AMD ROCm Documentation","year":"2025","unstructured":"AMD ROCm Documentation Team. 2025. Accelerator and GPU Hardware Specifications. Advanced Micro Devices, Inc. https:\/\/rocm.docs.amd.com\/en\/latest\/reference\/gpu-arch-specs.html Accessed: 2025-07-29."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Samaneh Aminikhanghahi and Diane\u00a0J Cook. 2017. A survey of methods for time series change point detection. Knowledge and information systems 51 2 (2017) 339\u2013367.","DOI":"10.1007\/s10115-016-0987-z"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/PDP.2010.67"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"Fatih Camci. 2010. CHANGE POINT DETECTION IN TIME SERIES DATA USING SUPPORT VECTORS. International Journal of Pattern Recognition and Artificial Intelligence 24 (02 2010) 73\u201395. 10.1142\/S0218001410007865","DOI":"10.1142\/S0218001410007865"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","unstructured":"Gautam Chakrabarti Vinod Grover Bastiaan Aarts Xiangyun Kong Manjunath Kudlur Yuan Lin Jaydeep Marathe Mike Murphy and Jian-Zhong Wang. 2012. CUDA: Compiling and optimizing for a GPU platform. Procedia Computer Science 9 (12 2012) 1910\u20131919. 10.1016\/j.procs.2012.04.209","DOI":"10.1016\/j.procs.2012.04.209"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42613.2021.9365803"},{"key":"e_1_3_3_2_18_2","volume-title":"Nsight Compute Documentation - NsightCompute 12.9 documentation 2025","author":"Corporation NVIDIA","year":"2025","unstructured":"NVIDIA Corporation. 2025. Nsight Compute Documentation - NsightCompute 12.9 documentation 2025. NVIDIA Corporation. https:\/\/docs.nvidia.com\/nsight-compute\/ Accessed: 2025-08-01."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2015.7054183"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Klaus Frick Axel Munk and Hannes Sieling. 2014. Multiscale change point inference. Journal of the Royal Statistical Society Series B: Statistical Methodology 76 3 (2014) 495\u2013580.","DOI":"10.1111\/rssb.12047"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","unstructured":"Thomas Grundy Rebecca Killick and Gueorgui Mihaylov. 2020. High-dimensional changepoint detection via a geometrically inspired mapping. Statistics and Computing 30 4 (03 2020) 1155\u20131166. 10.1007\/s11222-020-09940-y","DOI":"10.1007\/s11222-020-09940-y"},{"key":"e_1_3_3_2_22_2","unstructured":"Dominik Gr\u00f6\u00dfler. 2022. Capturing the Memory Topology of GPUs. https:\/\/mediatum.ub.tum.de\/doc\/1689994\/document.pdf"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4959921"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"D.\u00a0L. Hill and P.\u00a0V. Rao. 1977. Tests of Symmetry Based on Cram\u00e9r-von Mises Statistics. Biometrika 64 3 (1977) 489\u2013494. http:\/\/www.jstor.org\/stable\/2345324","DOI":"10.1093\/biomet\/64.3.489"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555775"},{"key":"e_1_3_3_2_26_2","unstructured":"HSA Foundation. 2018. HSA Runtime Programmer\u2019s Reference Manual 1.2. https:\/\/hsafoundation.com\/wp-content\/uploads\/2021\/02\/HSA-Runtime-1.2.pdf"},{"key":"e_1_3_3_2_27_2","unstructured":"Aaron Jarmusch Nathan Graddon and Sunita Chandrasekaran. 2025. Dissecting the NVIDIA Blackwell Architecture with Microbenchmarks. arxiv:https:\/\/arXiv.org\/abs\/2507.10789\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2507.10789"},{"key":"e_1_3_3_2_28_2","unstructured":"Zhe Jia Marco Maggioni Jeffrey Smith and Daniele\u00a0Paolo Scarpazza. 2019. Dissecting the nvidia turing t4 gpu via microbenchmarking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1903.07486 (2019)."},{"key":"e_1_3_3_2_29_2","unstructured":"Zhe Jia Marco Maggioni Benjamin Staiger and Daniele\u00a0P Scarpazza. 2018. Dissecting the NVIDIA volta GPU architecture via microbenchmarking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.06826 (2018)."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Rebecca Killick Paul Fearnhead and Idris\u00a0A Eckley. 2012. Optimal detection of changepoints with a linear computational cost. J. Amer. Statist. Assoc. 107 500 (2012) 1590\u20131598.","DOI":"10.1080\/01621459.2012.737745"},{"key":"e_1_3_3_2_31_2","unstructured":"Ronny Krashinsky Olivier Giroux and Stephen Jones. 2020. Optimizing Applications for NVIDIA Ampere GPU Architecture. https:\/\/developer.download.nvidia.com\/video\/gputechconf\/gtc\/2020\/presentations\/s21819-optimizing-applications-for-nvidia-ampere-gpu-architecture.pdf Presented at NVIDIA GTC 2020."},{"key":"e_1_3_3_2_32_2","unstructured":"Chester Lam. 2023. AMD\u2019s Radeon Instinct MI210: GCN Lives On. https:\/\/chipsandcheese.com\/p\/amds-radeon-instinct-mi210-gcn-lives-on?open=false#%C2%A7cache-and-memory-access"},{"key":"e_1_3_3_2_33_2","unstructured":"Chester Lam. 2023. Nvidia\u2019s H100: Funny L2 and Tons of Bandwidth. https:\/\/chipsandcheese.com\/p\/nvidias-h100-funny-l2-and-tons-of-bandwidth"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-34625-8_5"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"crossref","unstructured":"C\u00e9line L\u00e9vy-Leduc and Fran\u00e7ois Roueff. 2009. Detection and localization of change-points in high-dimensional network traffic data. The Annals of Applied Statistics (2009) 637\u2013662.","DOI":"10.1214\/08-AOAS232"},{"key":"e_1_3_3_2_36_2","unstructured":"Linux Kernel Community. 2025. AMDGPU Driver Core Documentation. https:\/\/docs.kernel.org\/gpu\/amdgpu\/driver-core.html"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Alexandre Lung-Yut-Fong C\u00e9line L\u00e9vy-Leduc and Olivier Capp\u00e9. 2011. Robust changepoint detection based on multivariate rank statistics. 2011 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP) (2011) 3608\u20133611. https:\/\/api.semanticscholar.org\/CorpusID:2975584","DOI":"10.1109\/ICASSP.2011.5946259"},{"key":"e_1_3_3_2_38_2","unstructured":"Weile Luo Ruibo Fan Zeyu Li Dayou Du Hongyuan Liu Qiang Wang and Xiaowen Chu. 2025. Dissecting the NVIDIA Hopper Architecture through Microbenchmarking and Multiple Level Analysis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.12084 (2025)."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"crossref","unstructured":"Frank\u00a0J Massey\u00a0Jr. 1951. The Kolmogorov-Smirnov test for goodness of fit. Journal of the American statistical Association 46 253 (1951) 68\u201378.","DOI":"10.1080\/01621459.1951.10500769"},{"key":"e_1_3_3_2_40_2","unstructured":"John McCalpin. 2006. STREAM: Sustainable memory bandwidth in high performance computers. http:\/\/www. cs. virginia. edu\/stream\/ (2006)."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Xinxin Mei and Xiaowen Chu. 2016. Dissecting GPU memory hierarchy through microbenchmarking. IEEE Transactions on Parallel and Distributed Systems 28 1 (2016) 72\u201386.","DOI":"10.1109\/TPDS.2016.2549523"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.23919\/ISC.2025.11017506"},{"key":"e_1_3_3_2_43_2","unstructured":"Nvidia. 2016. NVIDIA Tesla P100 Whitepaper. https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf"},{"key":"e_1_3_3_2_44_2","unstructured":"NVIDIA. 2022. NVIDIA H100 Tensor Core GPU Architecture Whitepaper. https:\/\/resources.nvidia.com\/en-us-hopper-architecture\/nvidia-h100-tensor-c"},{"key":"e_1_3_3_2_45_2","unstructured":"NVIDIA. 2024. NVIDIA H100 Tensor Core GPU. https:\/\/resources.nvidia.com\/en-us-hopper-architecture\/nvidia-tensor-core-gpu-datasheet"},{"key":"e_1_3_3_2_46_2","unstructured":"NVIDIA Corporation. 2025. CUDA Runtime API: cudaDeviceProp Struct Reference. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/structcudaDeviceProp.html"},{"key":"e_1_3_3_2_47_2","unstructured":"NVIDIA Corporation. 2025. NVIDIA Management Library (NVML) Reference Guide. https:\/\/docs.nvidia.com\/deploy\/nvml-api\/nvml-api-reference.html Official documentation for the NVIDIA Management Library (NVML) providing APIs for monitoring and managing GPU device properties performance and system metrics.."},{"key":"e_1_3_3_2_48_2","unstructured":"NVIDIA Corporation. 2025. NVIDIA Nsight Compute Documentation. https:\/\/docs.nvidia.com\/nsight-compute\/index.html"},{"key":"e_1_3_3_2_49_2","volume-title":"Parallel Thread Execution ISA Version 8.8","author":"Corporation NVIDIA","year":"2025","unstructured":"NVIDIA Corporation. 2025. Parallel Thread Execution ISA Version 8.8. NVIDIA Corporation. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution Accessed: 2025-07-27."},{"key":"e_1_3_3_2_50_2","unstructured":"Stephan Rabanser Stephan G\u00fcnnemann and Zachary\u00a0C. Lipton. 2019. Failing Loudly: An Empirical Study of Methods for Detecting Dataset Shift. arxiv:https:\/\/arXiv.org\/abs\/1810.11953\u00a0[stat.ML] https:\/\/arxiv.org\/abs\/1810.11953"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624208"},{"key":"e_1_3_3_2_52_2","unstructured":"Tobias Stuckenberger. 2025. Visual Presentation of GPUscout GPU Bottleneck Analysis. Master\u2019s thesis. Technical University of Munich School of Computation Information and Technology - Informatics Munich Germany. Master\u2019s thesis."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1201\/b17279"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2010.38"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Charles Truong Laurent Oudre and Nicolas Vayatis. 2020. Selective review of offline change point detection methods. Signal Processing 167 (2020) 107299.","DOI":"10.1016\/j.sigpro.2019.107299"},{"key":"e_1_3_3_2_56_2","volume-title":"SC \u201923: The International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Vanecek Stepan","year":"2023","unstructured":"Stepan Vanecek and Martin Schulz. 2023. sys-sage: A Fresh View on Dynamic Topologies & Attributes of HPC Systems. In SC \u201923: The International Conference for High Performance Computing, Networking, Storage and Analysis. ACM, Denver, CO, USA. https:\/\/sc23.supercomputing.org\/proceedings\/tech_poster\/poster_files\/rpost114s3-file3.pdf Extended abstract for a research poster."},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3650200.3656627"},{"key":"e_1_3_3_2_58_2","volume-title":"Introduction to Robust Estimation and Hypothesis Testing. 5th Edition. San Diego, CA: Academic Press","author":"Wilcox Rand","year":"2021","unstructured":"Rand Wilcox. 2021. Introduction to Robust Estimation and Hypothesis Testing. 5th Edition. San Diego, CA: Academic Press."},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"crossref","unstructured":"Samuel Williams Andrew Waterman and David Patterson. 2009. Roofline: an insightful visual performance model for multicore architectures. Commun. ACM 52 4 (2009) 65\u201376.","DOI":"10.1145\/1498765.1498785"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767518","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:34:05Z","timestamp":1767987245000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767518"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":58,"alternative-id":["10.1145\/3731599.3767518","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767518","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}