{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T11:33:12Z","timestamp":1751369592174,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":73,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,3,28]],"date-time":"2022-03-28T00:00:00Z","timestamp":1648425600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"name":"National Natural Science Foundation of China","award":["U20A20226"],"award-info":[{"award-number":["U20A20226"]}]},{"name":"Beijing Natural Science Foundation","award":["4202031"],"award-info":[{"award-number":["4202031"]}]},{"name":"National Key R&D Program of China","award":["2021YFB0300300"],"award-info":[{"award-number":["2021YFB0300300"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,4,2]]},"DOI":"10.1145\/3503221.3508405","type":"proceedings-article","created":{"date-parts":[[2022,3,28]],"date-time":"2022-03-28T13:58:22Z","timestamp":1648475902000},"page":"177-191","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["PerFlow"],"prefix":"10.1145","author":[{"given":"Yuyang","family":"Jin","sequence":"first","affiliation":[{"name":"Tsinghua University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haojie","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Runxin","family":"Zhong","sequence":"additional","affiliation":[{"name":"Tsinghua University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jidong","family":"Zhai","sequence":"additional","affiliation":[{"name":"Tsinghua University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,3,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2021. PAPI tools. http:\/\/icl.utk.edu\/papi\/software\/"},{"volume-title":"Paraver homepage","key":"e_1_3_2_1_2_1","unstructured":"2021. Paraver homepage. Barcelona Supercomputing Center. http:\/\/www.bsc.es\/paraver"},{"volume-title":"Scalasca homepage","key":"e_1_3_2_1_3_1","unstructured":"2021. Scalasca homepage. Julich Supercomputing Centre and German Research School for Simulation Sciences. http:\/\/www.scalasca.org"},{"volume-title":"Score-P homepage","key":"e_1_3_2_1_4_1","unstructured":"2021. Score-P homepage. Score-P Consortium. http:\/\/www.score-p.org"},{"volume-title":"TAU homepage","key":"e_1_3_2_1_5_1","unstructured":"2021. TAU homepage. University of Oregon. http:\/\/tau.uoregon.edu"},{"volume-title":"Vampir homepage","key":"e_1_3_2_1_6_1","unstructured":"2021. Vampir homepage. Technical University Dresden. http:\/\/www.vampir.eu"},{"key":"e_1_3_2_1_7_1","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI'16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et al. 2016. Tensorflow: A system for large-scale machine learning. In 12th USENIX symposium on operating systems design and implementation (OSDI'16). 265--283."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.1553"},{"key":"e_1_3_2_1_9_1","volume-title":"Theano: A Python framework for fast computation of mathematical expressions. arXiv e-prints","author":"Al-Rfou Rami","year":"2016","unstructured":"Rami Al-Rfou, Guillaume Alain, Amjad Almahairi, Christof Angermueller, Dzmitry Bahdanau, Nicolas Ballas, Fr\u00e9d\u00e9ric Bastien, Justin Bayer, Anatoly Belikov, Alexander Belopolsky, et al. 2016. Theano: A Python framework for fast computation of mathematical expressions. arXiv e-prints (2016), arXiv-1605."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.9"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2723372.2742797"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2007.370254"},{"key":"e_1_3_2_1_13_1","volume-title":"available at: http:\/lammps.sandia.gov","author":"Molecular Massively Parallel Simulator Atomic","year":"2013","unstructured":"Large-scale Atomic and Molecular Massively Parallel Simulator. 2013. Lammps. available at: http:\/lammps.sandia.gov (2013)."},{"key":"e_1_3_2_1_14_1","unstructured":"D. Bailey T. Harris W. Saphir R. V. D. Wijngaart A. Woo and M. Yarrow. 1995. The NAS Parallel Benchmarks 2.0. NAS Systems Division NASA Ames Research Center Moffett Field CA."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2007.370238"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356173"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628100"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2010.18"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2012.120"},{"volume-title":"Characterizing Load and Communication Imbalance in Large-Scale Parallel Applications. In 2012 IEEE 26th International Parallel and Distributed Processing Symposium Workshops PhD Forum (IPDPSW'12)","author":"Bohme D.","key":"e_1_3_2_1_20_1","unstructured":"D. Bohme, F. Wolf, and M. Geimer. 2012. Characterizing Load and Communication Imbalance in Large-Scale Parallel Applications. In 2012 IEEE 26th International Parallel and Distributed Processing Symposium Workshops PhD Forum (IPDPSW'12). 2538--2541."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00019"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2010.5544927"},{"key":"e_1_3_2_1_23_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. TVM: An automated end-to-end optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18). 578--594."},{"key":"e_1_3_2_1_24_1","unstructured":"Gabor Csardi Tamas Nepusz et al. 2006. The igraph software package for complex network research. (2006)."},{"key":"e_1_3_2_1_25_1","volume-title":"Dataflow architectures. Annual review of computer science 1, 1","author":"Culler David E","year":"1986","unstructured":"David E Culler. 1986. Dataflow architectures. Annual review of computer science 1, 1 (1986), 225--253."},{"key":"e_1_3_2_1_26_1","volume-title":"Programl: Graph-based deep learning for program optimization and analysis. arXiv preprint arXiv:2003.10536","author":"Cummins Chris","year":"2020","unstructured":"Chris Cummins, Zacharias V Fisches, Tal Ben-Nun, Torsten Hoefler, and Hugh Leather. 2020. Programl: Graph-based deep learning for program optimization and analysis. arXiv preprint arXiv:2003.10536 (2020)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2815400.2815409"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/1327452.1327492"},{"key":"e_1_3_2_1_29_1","volume-title":"4th USENIX Symposium on Networked Systems Design & Implementation (NSDI'07)","author":"Fonseca Rodrigo","year":"2007","unstructured":"Rodrigo Fonseca, George Porter, Randy H Katz, and Scott Shenker. 2007. X-trace: A pervasive network tracing framework. In 4th USENIX Symposium on Networked Systems Design & Implementation (NSDI'07)."},{"volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC'08)","author":"Gamblin T.","key":"e_1_3_2_1_30_1","unstructured":"T. Gamblin, B.R. de Supinski, M. Schulz, R. Fowler, and D.A. Reed. 2008. Scalable load-balance measurement for SPMD codes. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC'08). 1--12."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.1556"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00098"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/872726.806987"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 2011 USENIX Conference on Annual Technical Conference (USENIX ATC'11). 27--27","author":"Guo Zhenyu","year":"2011","unstructured":"Zhenyu Guo, Dong Zhou, Haoxiang Lin, Mao Yang, Fan Long, Chaoqiang Deng, Changshu Liu, and Lidong Zhou. 2011. G2: a graph processing system for diagnosing distributed systems. In Proceedings of the 2011 USENIX Conference on Annual Technical Conference (USENIX ATC'11). 27--27."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1086\/504594"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356220"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441585"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1272996.1273005"},{"key":"e_1_3_2_1_39_1","volume-title":"Tools for High Performance Computing","author":"January Christopher","year":"2014","unstructured":"Christopher January, Jonathan Byrd, Xavier Or\u00f3, and Mark O'Connor. 2015. Allinea MAP: Adding Energy and OpenMP Profiling Without Increasing Overhead. In Tools for High Performance Computing 2014. Springer, 25--35."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00032"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132747.3132749"},{"key":"e_1_3_2_1_43_1","volume-title":"Craypat-cray x1 performance analysis tool","author":"Kaufmann Steve","year":"2003","unstructured":"Steve Kaufmann and Bill Homer. 2003. Craypat-cray x1 performance analysis tool. Cray User Group (May 2003) (2003)."},{"volume-title":"Tools for high performance computing","author":"Kn\u00fcpfer Andreas","key":"e_1_3_2_1_44_1","unstructured":"Andreas Kn\u00fcpfer, Holger Brunst, Jens Doleschal, Matthias Jurenz, Matthias Lieber, Holger Mickler, Matthias S M\u00fcller, and Wolfgang E Nagel. 2008. The vampir performance analysis tool-set. In Tools for high performance computing. Springer, 139--155."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0024763"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2017.00030"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/1513895.1513905"},{"key":"e_1_3_2_1_48_1","volume-title":"VAMPIR: Visualization and analysis of MPI resources.","author":"Nagel Wolfgang E","year":"1996","unstructured":"Wolfgang E Nagel, Alfred Arnold, Michael Weber, Hans-Christian Hoppe, and Karl Solchenbach. 1996. VAMPIR: Visualization and analysis of MPI resources. (1996)."},{"key":"e_1_3_2_1_49_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in Neural Information Processing Systems (NeurIPS'19) 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in Neural Information Processing Systems (NeurIPS'19) 32 (2019), 8026--8037."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/1048935.1050204"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356209"},{"key":"e_1_3_2_1_52_1","volume-title":"VTune performance analyzer essentials","author":"Reinders James","year":"2005","unstructured":"James Reinders. 2005. VTune performance analyzer essentials. Intel Press (2005)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1137\/0217079"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195489.3195491"},{"key":"e_1_3_2_1_55_1","volume-title":"European Conference on Parallel Processing. Springer, 185--198","author":"Servat Harald","year":"2009","unstructured":"Harald Servat, Germ\u00e1n Llort, Judit Gim\u00e9nez, and Jes\u00fas Labarta. 2009. Detailed performance analysis using coarse grain sampling. In European Conference on Parallel Processing. Springer, 185--198."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342006064482"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00104"},{"key":"e_1_3_2_1_58_1","volume-title":"Mike Burrows, Pat Stephenson, Manoj Plakal, Donald Beaver, Saul Jaspan, and Chandan Shanbhag.","author":"Sigelman Benjamin H","year":"2010","unstructured":"Benjamin H Sigelman, Luiz Andre Barroso, Mike Burrows, Pat Stephenson, Manoj Plakal, Donald Beaver, Saul Jaspan, and Chandan Shanbhag. 2010. Dapper, a large-scale distributed systems tracing infrastructure. (2010)."},{"volume-title":"Proceedings of the 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis (SC'10)","author":"Tallent Nathan R.","key":"e_1_3_2_1_59_1","unstructured":"Nathan R. Tallent, Laksono Adhianto, and John M. Mellor-Crummey. 2010. Scalable Identification of Load Imbalance in Parallel Executions Using Call Path Profiles. In Proceedings of the 2010 ACM\/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis (SC'10). Washington, DC, USA, 1--11."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/1693453.1693489"},{"key":"e_1_3_2_1_61_1","volume-title":"Programmable inference accelerator. Retrieved August 1","author":"NVIDIA","year":"2019","unstructured":"NVIDIA TensorRT. 2019. Programmable inference accelerator. Retrieved August 1 (2019)."},{"key":"e_1_3_2_1_62_1","unstructured":"Jeffrey Vetter and Chris Chambreau. 2005. mpip: Lightweight scalable mpi profiling. (2005)."},{"key":"e_1_3_2_1_63_1","volume-title":"PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI'21)","author":"Wang Haojie","year":"2021","unstructured":"Haojie Wang, Jidong Zhai, Mingyu Gao, Zixuan Ma, Shizhi Tang, Liyan Zheng, Yuanzhi Li, Kaiyuan Rong, Yuanyong Chen, and Zhihao Jia. 2021. PET: Optimizing Tensor Programs with Partially Equivalent Transformations and Automated Corrections. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI'21). 37--54."},{"key":"e_1_3_2_1_64_1","volume-title":"Proceedings of the 2018 USENIX Conference on Annual Technical Conference (USENIX ATC'18). 561--574","author":"Wang Haojie","year":"2018","unstructured":"Haojie Wang, Jidong Zhai, Xiongchao Tang, Bowen Yu, Xiaosong Ma, and Wenguang Chen. 2018. Spindle: informed memory access monitoring. In Proceedings of the 2018 USENIX Conference on Annual Technical Conference (USENIX ATC'18). 561--574."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374538"},{"key":"e_1_3_2_1_66_1","volume-title":"Tools for High Performance Computing","author":"Williams William R","year":"2015","unstructured":"William R Williams, Xiaozhu Meng, Benjamin Welton, and Barton P Miller. 2016. Dyninst and MRNet: Foundational infrastructure for parallel tools. In Tools for High Performance Computing 2015. Springer, 1--16."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/2929908.2929911"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/2931037.2931070"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.17"},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of the 2017 USENIX conference on Annual Technical Conference (USENIX ATC'17). 181--193","author":"Zhang Hao","year":"2017","unstructured":"Hao Zhang, Zeyu Zheng, Shizhen Xu, Wei Dai, Qirong Ho, Xiaodan Liang, Zhiting Hu, Jinliang Wei, Pengtao Xie, and Eric P Xing. 2017. Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters. In Proceedings of the 2017 USENIX conference on Annual Technical Conference (USENIX ATC'17). 181--193."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359650"},{"key":"e_1_3_2_1_72_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, et al. 2020. Ansor: Generating high-performance tensor programs for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20). 863--879."},{"key":"e_1_3_2_1_73_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18)","author":"Zhou Fang","year":"2018","unstructured":"Fang Zhou, Yifan Gan, Sixiang Ma, and Yang Wang. 2018. wPerf: generic Off-CPU analysis to identify bottleneck waiting events. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18). 527--543."}],"event":{"name":"PPoPP '22: 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"],"location":"Seoul Republic of Korea","acronym":"PPoPP '22"},"container-title":["Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503221.3508405","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503221.3508405","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:49Z","timestamp":1750186849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503221.3508405"}},"subtitle":["a domain specific framework for automatic performance analysis of parallel applications"],"short-title":[],"issued":{"date-parts":[[2022,3,28]]},"references-count":73,"alternative-id":["10.1145\/3503221.3508405","10.1145\/3503221"],"URL":"https:\/\/doi.org\/10.1145\/3503221.3508405","relation":{},"subject":[],"published":{"date-parts":[[2022,3,28]]},"assertion":[{"value":"2022-03-28","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}