{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T08:16:32Z","timestamp":1769501792756,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"name":"NSERC","award":["RGPIN-2025-05285"],"award-info":[{"award-number":["RGPIN-2025-05285"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756105","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"1491-1504","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["TransFusion: End-to-End Transformer Acceleration via Graph Fusion and Pipelining"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6760-5504","authenticated-orcid":false,"given":"Linxuan","family":"Zhang","sequence":"first","affiliation":[{"name":"The Department of Electrical and Computer Engineering, University of Alberta, Edmonton, Alberta, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9943-1809","authenticated-orcid":false,"given":"J. Nelson","family":"Amaral","sequence":"additional","affiliation":[{"name":"Department of Computing Science, University of Alberta, Edmonton, Alberta, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5250-7327","authenticated-orcid":false,"given":"Di","family":"Niu","sequence":"additional","affiliation":[{"name":"The Department of Electrical and Computer Engineering, University of Alberta, Edmonton, Alberta, Canada"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Apple. 2024. Core ML Tools. https:\/\/apple.github.io\/coremltools\/docs-guides\/source\/opt-palettization-overview.html."},{"key":"e_1_3_3_2_3_2","unstructured":"Minsik Cho Keivan Alizadeh-Vahid Saurabh\u00a0N. Adya and Mohammad Rastegari. 2021. DKM: Differentiable K-Means Clustering Layer for Neural Network Compression. ArXiv abs\/2108.12659 (2021). https:\/\/api.semanticscholar.org\/CorpusID:237353080"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC55918.2022.00018"},{"key":"e_1_3_3_2_5_2","volume-title":"Advances in Neural Information Processing Systems","author":"CONNEAU Alexis","year":"2019","unstructured":"Alexis CONNEAU and Guillaume Lample. 2019. Cross-lingual Language Model Pretraining. In Advances in Neural Information Processing Systems , H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.), Vol.\u00a032. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/c04c19c2c2474dbf5f7ac4372c5b9af1-Paper.pdf"},{"key":"e_1_3_3_2_6_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_7_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Daniel\u00a0Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. 
In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_8_2","series-title":"(NIPS \u201922)","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS \u201922). Curran Associates Inc., Red Hook, NY, USA, Article 2198, 15\u00a0pages."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_3_2_10_2","unstructured":"Elias Frantar Saleh Ashkboos Torsten Hoefler and Dan Alistarh. 2022. GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.17323 (2022)."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","unstructured":"Prakhar Ganesh Yao Chen Xin Lou Mohammad\u00a0Ali Khan Yin Yang Hassan Sajjad Preslav Nakov Deming Chen and Marianne Winslett. 2021. Compressing Large-Scale Transformer-Based Models: A Case Study on BERT. Transactions of the Association for Computational Linguistics 9 (2021) 1061\u20131080. 10.1162\/tacl_a_00413","DOI":"10.1162\/tacl_a_00413"},{"key":"e_1_3_3_2_12_2","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et\u00a0al. 2024. The Llama 3 Herd of Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783 (2024)."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589038"},{"key":"e_1_3_3_2_14_2","unstructured":"Yatharth Gupta Vishnu\u00a0V. Jaddipal Harish Prabhala Sayak Paul and Patrick von Platen. 2024. Progressive Knowledge Distillation Of Stable Diffusion XL Using Layer Level Loss. ArXiv abs\/2401.02677 (2024). https:\/\/api.semanticscholar.org\/CorpusID:266818179"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Tae\u00a0Jun Ham Sungjun Jung Seonghak Kim Young\u00a0H. Oh Yeonhong Park Yoonho Song Jung-Hun Park Sanghee Lee Kyoung Park Jae\u00a0W. Lee and Deog-Kyoon Jeong. 2020. A3: Accelerating Attention Mechanisms in Neural Networks with Approximation. 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA) (2020) 328\u2013341. https:\/\/api.semanticscholar.org\/CorpusID:211296403","DOI":"10.1109\/HPCA47549.2020.00035"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00060"},{"key":"e_1_3_3_2_17_2","first-page":"148","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Hong Ke","year":"2024","unstructured":"Ke Hong, Guohao Dai, Jiaming Xu, Qiuli Mao, Xiuhong Li, Jun Liu, Kangdi Chen, Yuhan Dong, and Yu Wang. 2024. FlashDecoding++: Faster Large Language Model Inference with Asynchronization, Flat GEMM Optimization, and Heuristics. In Proceedings of Machine Learning and Systems , P.\u00a0Gibbons, G.\u00a0Pekhimenko, and C.\u00a0De Sa (Eds.), Vol.\u00a06. 148\u2013161. 
https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/5321b1dabcd2be188d796c21b733e8c7-Paper-Conference.pdf"},{"key":"e_1_3_3_2_18_2","series-title":"(NIPS \u201923)","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Huang Tao","year":"2023","unstructured":"Tao Huang, Yuan Zhang, Mingkai Zheng, Shan You, Fei Wang, Chen Qian, and Chang Xu. 2023. Knowledge Diffusion for Distillation. In Proceedings of the 37th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS \u201923). Curran Associates Inc., Red Hook, NY, USA, Article 2849, 18\u00a0pages."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575747"},{"key":"e_1_3_3_2_20_2","unstructured":"Guillaume Lample and Alexis Conneau. 2019. Cross-lingual Language Model Pretraining. ArXiv abs\/1901.07291 (2019). https:\/\/api.semanticscholar.org\/CorpusID:58981712"},{"key":"e_1_3_3_2_21_2","unstructured":"Benjamin Lefaudeux Francisco Massa Diana Liskovich Wenhan Xiong Vittorio Caggiano Sean Naren Min Xu Jieru Hu Marta Tintore Susan Zhang Patrick Labatut Daniel Haziza Luca Wehrstedt Jeremy Reizenstein and Grigory Sizov. 2022. xFormers: A modular and hackable Transformer modelling library. https:\/\/github.com\/facebookresearch\/xformers."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO57630.2024.10444871"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547911"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Wenjie Li Dongxu Lyu Gang Wang Aokun Hu Ningyi Xu and Guanghui He. 2024. Hardware-oriented algorithms for softmax and layer normalization of large language models. Science China Information Sciences 67 10 (2024) 200404.","DOI":"10.1007\/s11432-024-4137-4"},{"key":"e_1_3_3_2_25_2","series-title":"(NIPS \u201922)","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Li Yanjing","year":"2022","unstructured":"Yanjing Li, Sheng Xu, Baochang Zhang, Xianbin Cao, Peng Gao, and Guodong Guo. 2022. Q-ViT: Accurate and Fully Quantized Low-bit Vision Transformer. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS \u201922). Curran Associates Inc., Red Hook, NY, USA, Article 2496, 13\u00a0pages."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01565"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.24963\/IJCAI.2022\/164"},{"key":"e_1_3_3_2_28_2","series-title":"(NIPS \u201921)","volume-title":"Proceedings of the 35th International Conference on Neural Information Processing Systems","author":"Liu Zhenhua","year":"2021","unstructured":"Zhenhua Liu, Yunhe Wang, Kai Han, Wei Zhang, Siwei Ma, and Wen Gao. 2021. Post-Training Quantization for Vision Transformer. In Proceedings of the 35th International Conference on Neural Information Processing Systems(NIPS \u201921). Curran Associates Inc., Red Hook, NY, USA, Article 2152, 12\u00a0pages."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","unstructured":"Jiachen Mao Huanrui Yang Ang Li Hai Li and Yiran Chen. 2021. TPrune: Efficient Transformer Pruning for Mobile Devices. ACM Trans. Cyber-Phys. Syst. 5 3 Article 26 (April 2021) 22\u00a0pages. 
10.1145\/3446640","DOI":"10.1145\/3446640"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623791"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00107"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","unstructured":"Thomas Norrie Nishant Patil Doe\u00a0Hyun Yoon George Kurian Sheng Li James Laudon Cliff Young Norman Jouppi and David Patterson. 2021. The Design Process for Google\u2019s Training Chips: TPUv2 and TPUv3. IEEE Micro 41 2 (2021) 56\u201363. 10.1109\/MM.2021.3058217","DOI":"10.1109\/MM.2021.3058217"},{"key":"e_1_3_3_2_34_2","unstructured":"Nvidia. 2024. TensorRT. https:\/\/docs.nvidia.com\/deeplearning\/tensorrt\/ archives\/tensorrt-803\/best-practices\/index.html."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISQED51717.2021.9424344"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","unstructured":"Tairen Piao Ikhyun Cho and U. Kang. 2022. SensiMix: Sensitivity-Aware 8-bit index & 1-bit value mixed precision quantization for BERT compression. PLOS ONE 17 4 (04 2022) 1\u201322. 10.1371\/journal.pone.0265621","DOI":"10.1371\/journal.pone.0265621"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507738"},{"key":"e_1_3_3_2_39_2","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et\u00a0al. 2018. Improving Language Understanding by Generative Pre-Training. (2018)."},{"key":"e_1_3_3_2_40_2","unstructured":"Colin Raffel Noam Shazeer Adam Roberts Katherine Lee Sharan Narang Michael Matena Yanqi Zhou Wei Li and Peter\u00a0J. Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research 21 140 (2020) 1\u201367. http:\/\/jmlr.org\/papers\/v21\/20-074.html"},{"key":"e_1_3_3_2_41_2","first-page":"68658","volume-title":"Advances in Neural Information Processing Systems","volume":"37","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision. In Advances in Neural Information Processing Systems , A.\u00a0Globerson, L.\u00a0Mackey, D.\u00a0Belgrave, A.\u00a0Fan, U.\u00a0Paquet, J.\u00a0Tomczak, and C.\u00a0Zhang (Eds.), Vol.\u00a037. Curran Associates, Inc., 68658\u201368685. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2024\/file\/7ede97c3e082c6df10a8d6103a2eebd2-Paper-Conference.pdf"},{"key":"e_1_3_3_2_42_2","unstructured":"Mohammadali Shakerdargah Shan Lu Chao Gao and Di Niu. 2024. MAS-Attention: Memory-Aware Stream Processing for Attention Acceleration on Resource-Constrained Edge Devices. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.17720 (2024)."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1441"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSD53832.2021.00074"},{"key":"e_1_3_3_2_45_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. 
arxiv:https:\/\/arXiv.org\/abs\/2302.13971\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_3_2_46_2","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems , I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.), Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_3_2_48_2","series-title":"(NIPS \u201922)","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Wang Naigang","year":"2022","unstructured":"Naigang Wang, Chi-Chun Liu, Swagath Venkataramani, Sanchari Sen, Chia-Yu Chen, Kaoutar El\u00a0Maghraoui, Vijayalakshmi Srinivasan, and Leland Chang. 2022. Deep Compression of Pre-trained Transformer Models. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS \u201922). Curran Associates Inc., Red Hook, NY, USA, Article 1028, 15\u00a0pages."},{"key":"e_1_3_3_2_49_2","unstructured":"Shuohang Wang Luowei Zhou Zhe Gan Yen-Chun Chen Yuwei Fang Siqi Sun Yu Cheng and Jingjing Liu. 2020. Cluster-Former: Clustering-based Sparse Transformer for Long-Range Dependency Encoding. ArXiv abs\/2009.06097 (2020). https:\/\/api.semanticscholar.org\/CorpusID:260424300"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.188"},{"key":"e_1_3_3_2_51_2","series-title":"(NIPS \u201920)","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Wang Wenhui","year":"2020","unstructured":"Wenhui Wang, Furu Wei, Li Dong, Hangbo Bao, Nan Yang, and Ming Zhou. 2020. MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS \u201920). Curran Associates Inc., Red Hook, NY, USA, Article 485, 13\u00a0pages."},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD45719.2019.8942149"},{"key":"e_1_3_3_2_53_2","series-title":"(NIPS \u201922)","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza\u00a0Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. 2022. ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS \u201922). Curran Associates Inc., Red Hook, NY, USA, Article 1970, 16\u00a0pages."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02170"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","unstructured":"Fang Yu Kun Huang Meng Wang Yuan Cheng Wei Chu and Li Cui. 2022. Width & Depth Pruning for Vision Transformers. Proceedings of the AAAI Conference on Artificial Intelligence 36 3 (Jun. 
2022) 3143\u20133151. 10.1609\/aaai.v36i3.20222","DOI":"10.1609\/aaai.v36i3.20222"},{"key":"e_1_3_3_2_56_2","volume-title":"ICLR","author":"Yu Shixing","year":"2022","unstructured":"Shixing Yu, Tianlong Chen, Jiayi Shen, Huan Yuan, Jianchao Tan, Sen Yang, Ji Liu, and Zhangyang Wang. 2022. Unified Visual Transformer Compression. In ICLR."},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00071"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623792"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756105","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:43:42Z","timestamp":1769463822000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756105"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":58,"alternative-id":["10.1145\/3725843.3756105","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756105","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}