{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T08:36:59Z","timestamp":1769503019490,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T00:00:00Z","timestamp":1760659200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2141064"],"award-info":[{"award-number":["2141064"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3762817","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:21:19Z","timestamp":1760721679000},"page":"201-216","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["StreamTensor: Make Tensors Stream in Dataflow Accelerators for LLMs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6646-8146","authenticated-orcid":false,"given":"Hanchen","family":"Ye","sequence":"first","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, Illinois, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3016-0270","authenticated-orcid":false,"given":"Deming","family":"Chen","sequence":"additional","affiliation":[{"name":"Inspirit IoT, Inc., Champaign, Illinois, USA and University of Illinois Urbana-Champaign, Urbana, Illinois, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895630"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3508352.3549424"},{"key":"e_1_3_3_1_4_2","unstructured":"Meta AI. 2024. Llama 3.2 1B Model Card. https:\/\/huggingface.co\/meta-llama\/Llama-3.2-1B"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330701"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/2228360.2228584"},{"key":"e_1_3_3_1_8_2","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et\u00a0al. 2023. Qwen technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.16609 (2023)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706628.3708878"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356173"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Bishnupriya Bhattacharya and Shuvra\u00a0S Bhattacharyya. 2001. Parameterized dataflow modeling for DSP systems. IEEE Transactions on Signal Processing 49 10 (2001) 2408\u20132421.","DOI":"10.1109\/78.950795"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Greet Bilsen Marc Engels Rudy Lauwereins and Jean Peperstraete. 1996. Cycle-static dataflow. IEEE Transactions on signal processing 44 2 (1996) 397\u2013408.","DOI":"10.1109\/78.485935"},{"key":"e_1_3_3_1_13_2","unstructured":"Jeffrey Burns Leland Chang AI Hardware AI Scaling Kim Martineau and AI Generative. 2022. Meet the ibm artificial intelligence unit. IBM Research.[Online]. Available: https:\/\/research. ibm. com\/blog\/ibm-artificial-intelligence-unit-aiu (2022)."},{"key":"e_1_3_3_1_14_2","unstructured":"Deming Chen Jason Cong Yiping Fan Guoling Han Wei Jiang and Zhiru Zhang. 2005. xpilot: A platform-based behavioral synthesis system. SRC TechCon 5 (2005) 54."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Hongzheng Chen Jiahao Zhang Yixiao Du Shaojie Xiang Zichao Yue Niansong Zhang Yaohui Cai and Zhiru Zhang. 2024. Understanding the potential of fpga-based spatial acceleration for large language model inference. ACM Transactions on Reconfigurable Technology and Systems 18 1 (2024) 1\u201329.","DOI":"10.1145\/3656177"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Hongzheng Chen Niansong Zhang Shaojie Xiang Zhichen Zeng Mengjia Dai and Zhiru Zhang. 2024. Allo: A Programming Model for Composable Accelerator Design. Proceedings of the ACM on Programming Languages 8 PLDI (2024) 593\u2013620.","DOI":"10.1145\/3656401"},{"key":"e_1_3_3_1_17_2","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et\u00a0al. 2018. { TVM} : An automated { End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Yu-Hsin Chen Tushar Krishna Joel\u00a0S Emer and Vivienne Sze. 2016. Eyeriss: An energy-efficient reconfigurable accelerator for deep convolutional neural networks. IEEE journal of solid-state circuits 52 1 (2016) 127\u2013138.","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/2966986.2967077"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240850"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette. 2023. Nvidia hopper h100 gpu: Scaling performance. IEEE Micro 43 3 (2023) 9\u201317.","DOI":"10.1109\/MM.2023.3256796"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/2554688.2554771"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3543622.3573188"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3385412.3385983"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3289602.3293906"},{"key":"e_1_3_3_1_26_2","unstructured":"Google. 2025. Gemma 3 1B IT Model Card. https:\/\/huggingface.co\/google\/gemma-3-1b-it"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Ramaswamy Govindarajan Guang\u00a0R Gao and Palash Desai. 2002. Minimizing buffer requirements under rate-optimal schedule in regular dataflow networks. Journal of VLSI signal processing systems for signal image and video technology 31 3 (2002) 207\u2013229.","DOI":"10.1023\/A:1015452903532"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3431920.3439289"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582018"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00051"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Alexandre Honorat Micka\u00ebl Dardaillon Hugo Miomandre and Jean-Fran\u00e7ois Nezan. 2024. Automated Buffer Sizing of Dataflow Applications in a High-Level Synthesis Workflow. ACM Transactions on Reconfigurable Technology and Systems 17 1 (2024) 1\u201326.","DOI":"10.1145\/3626103"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3174243.3174264"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Lana Josipovi\u0107 Shabnam Sheikhha Andrea Guerrieri Paolo Ienne and Jordi Cortadella. 2021. Buffer placement and sizing for high-performance dataflow circuits. ACM Transactions on Reconfigurable Technology and Systems (TRETS) 15 1 (2021) 1\u201332.","DOI":"10.1145\/3477053"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"David Koeplinger Christina Delimitrou Raghu Prabhakar Christos Kozyrakis Yaqi Zhang and Kunle Olukotun. 2016. Automatic generation of efficient accelerators for reconfigurable hardware. ACM SIGARCH Computer Architecture News 44 3 (2016) 115\u2013127.","DOI":"10.1145\/3007787.3001150"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3192366.3192379"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Edward\u00a0A Lee and David\u00a0G Messerschmitt. 1987. Synchronous data flow. Proc. IEEE 75 9 (1987) 1235\u20131245.","DOI":"10.1109\/PROC.1987.13876"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD.2013.6691122"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/MEMCOD.2004.1459852"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080255"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Tony Nowatzki Michael Sartin-Tarm Lorenzo De\u00a0Carli Karthikeyan Sankaralingam Cristian Estan and Behnam Robatmili. 2013. A general constraint-centric scheduling framework for spatial architectures. ACM SIGPLAN Notices 48 6 (2013) 495\u2013506.","DOI":"10.1145\/2499370.2462163"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","unstructured":"OpenAI. 2019. GPT-2 Model Card. 10.57967\/hf\/0039","DOI":"10.57967\/hf\/0039"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Raghu Prabhakar Ram Sivaramakrishnan Darshan Gandhi Yun Du Mingran Wang Xiangyu Song Kejie Zhang Tianren Gao Angela Wang Karen Li et\u00a0al. 2024. SambaNova SN40L: Scaling the AI Memory Wall with Dataflow and Composition of Experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.07518 (2024).","DOI":"10.1109\/MICRO61859.2024.00100"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Raghu Prabhakar Yaqi Zhang David Koeplinger Matt Feldman Tian Zhao Stefan Hadjis Ardavan Pedram Christos Kozyrakis and Kunle Olukotun. 2017. Plasticine: A reconfigurable architecture for parallel paterns. ACM SIGARCH Computer Architecture News 45 2 (2017) 389\u2013402.","DOI":"10.1145\/3140659.3080256"},{"key":"e_1_3_3_1_46_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Jonathan Ragan-Kelley Connelly Barnes Andrew Adams Sylvain Paris Fr\u00e9do Durand and Saman Amarasinghe. 2013. Halide: a language and compiler for optimizing parallelism locality and recomputation in image processing pipelines. Acm Sigplan Notices 48 6 (2013) 519\u2013530.","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_3_1_48_2","unstructured":"Alexander Rucker Shiv Sundram Coleman Smith Matthew Vilim Raghu Prabhakar Fredrik Kjolstad and Kunle Olukotun. 2023. Revet: A Language and Compiler for Dataflow Threads. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.06124 (2023)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480047"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071015"},{"key":"e_1_3_3_1_51_2","unstructured":"Gemma Team Thomas Mesnard Cassidy Hardin Robert Dadashi Surya Bhupatiraju Shreya Pathak Laurent Sifre Morgane Rivi\u00e8re Mihir\u00a0Sanjay Kale Juliette Love et\u00a0al. 2024. Gemma: Open models based on gemini research and technology. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.08295 (2024)."},{"key":"e_1_3_3_1_52_2","unstructured":"Qwen Team. 2024. Qwen2.5-0.5B Model Card. https:\/\/huggingface.co\/Qwen\/Qwen2.5-0.5B"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45937-5_14"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378495"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_3_1_56_2","unstructured":"Torch-MLIR Contributors. 2021. Torch-MLIR: A compiler for the PyTorch ecosystem. https:\/\/github.com\/llvm\/torch-mlir. Accessed: 2025-06-20."},{"key":"e_1_3_3_1_57_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/1233501.1233650"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00032"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637570"},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00060"},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3617232.3624850"},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00017"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240801"},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00085"},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"publisher","DOI":"10.1109\/FPL57034.2022.00044"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575694"},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3543622.3573210"},{"key":"e_1_3_3_1_69_2","doi-asserted-by":"crossref","unstructured":"Jinming Zhuang Jason Lau Hanchen Ye Zhuoping Yang Shixin Ji Jack Lo Kristof Denolf Stephen Neuendorffer Alex Jones Jingtong Hu et\u00a0al. 2024. CHARM 2.0: Composing Heterogeneous Accelerators for Deep Learning on Versal ACAP Architecture. ACM Transactions on Reconfigurable Technology and Systems 17 3 (2024) 1\u201331.","DOI":"10.1145\/3686163"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3762817","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3762817","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:48:32Z","timestamp":1769464112000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3762817"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":68,"alternative-id":["10.1145\/3725843.3762817","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3762817","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}