{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:03:30Z","timestamp":1776931410513,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T00:00:00Z","timestamp":1760659200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1937592, 2048183, 2434166"],"award-info":[{"award-number":["1937592, 2048183, 2434166"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"The Department of Health and Human Services Advanced Research Projects Agency for Health (ARPA-H)","award":["AY1AX000003, 140D042490003"],"award-info":[{"award-number":["AY1AX000003, 140D042490003"]}]},{"name":"JUMP 2.0, a Semiconductor Research Corporation (SRC) program sponsored by DARPA","award":["CoCoSys"],"award-info":[{"award-number":["CoCoSys"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756039","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"476-489","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ORCHES: Orchestrated Test-Time-Compute-based LLM Reasoning on Collaborative GPU-PIM HEterogeneous System"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9105-9299","authenticated-orcid":false,"given":"Sixu","family":"Li","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9236-0480","authenticated-orcid":false,"given":"Yuzhou","family":"Chen","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4030-9777","authenticated-orcid":false,"given":"Chaojian","family":"Li","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7483-2921","authenticated-orcid":false,"given":"Yonggan","family":"Fu","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9467-7460","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9981-4981","authenticated-orcid":false,"given":"Zhongzhi","family":"Yu","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2873-2153","authenticated-orcid":false,"given":"Haoran","family":"You","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0755-8843","authenticated-orcid":false,"given":"Zhifan","family":"Ye","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9770-3583","authenticated-orcid":false,"given":"Wei","family":"Zhou","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7919-049X","authenticated-orcid":false,"given":"Yongan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5946-203X","authenticated-orcid":false,"given":"Yingyan (Celine)","family":"Lin","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/FPL.2019.00049"},{"key":"e_1_3_3_1_3_2","unstructured":"Jing Bi Junjia Guo Susan Liang Guangyu Sun Luchuan Song Yunlong Tang Jinxi He Jiarui Wu Ali Vosoughi Chen Chen and Chenliang Xu. 2025. VERIFY: A Benchmark of Visual Explanation and Reasoning for Investigating Multimodal Reasoning Fidelity. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.11557 (2025)."},{"key":"e_1_3_3_1_4_2","unstructured":"Charlie Chen Sebastian Borgeaud Geoffrey Irving Jean-Baptiste Lespiau Laurent Sifre and John Jumper. 2023. Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.01318 (2023)."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00084"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Ping Chi Shuangchen Li Cong Xu Tao Zhang Jishen Zhao Yongpan Liu Yu Wang and Yuan Xie. 2016. Prime: A novel processing-in-memory architecture for neural network computation in reram-based main memory. ACM SIGARCH Computer Architecture News 44 3 (2016) 27\u201339.","DOI":"10.1145\/3007787.3001140"},{"key":"e_1_3_3_1_7_2","unstructured":"DeepSeek-AI. 2025. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arxiv:https:\/\/arXiv.org\/abs\/2501.12948\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2501.12948"},{"key":"e_1_3_3_1_8_2","first-page":"13109","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong\u00a0Li Lee, and Wynne Hsu. 2024. Video-of-thought: step-by-step video reasoning from perception to cognition. In Proceedings of the 41st International Conference on Machine Learning. 13109\u201313125."},{"key":"e_1_3_3_1_9_2","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. 2021. Measuring Mathematical Problem Solving With the MATH Dataset. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651380"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00086"},{"key":"e_1_3_3_1_12_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Jain Naman","unstructured":"Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica. [n. d.]. LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Yoongu Kim Weikun Yang and Onur Mutlu. 2015. Ramulator: A fast and extensible DRAM simulator. IEEE Computer architecture letters 15 1 (2015) 45\u201349.","DOI":"10.1109\/LCA.2015.2414456"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42613.2021.9365862"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00080"},{"key":"e_1_3_3_1_17_2","first-page":"19274","volume-title":"International Conference on Machine Learning","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning. PMLR, 19274\u201319286."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00073"},{"key":"e_1_3_3_1_19_2","unstructured":"Runze Liu Junqi Gao Jian Zhao Kaiyan Zhang Xiu Li Biqing Qi Wanli Ouyang and Bowen Zhou. 2025. Can 1B LLM Surpass 405B LLM? Rethinking Compute-Optimal Test-Time Scaling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.06703 (2025)."},{"key":"e_1_3_3_1_20_2","unstructured":"Pan Lu Hritik Bansal Tony Xia Jiacheng Liu Chunyuan Li Hannaneh Hajishirzi Hao Cheng Kai-Wei Chang Michel Galley and Jianfeng Gao. 2023. Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.02255 (2023)."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Haocong Luo Yahya\u00a0Can Tu\u011frul F\u00a0Nisa Bostanc\u0131 Ataberk Olgun A\u00a0Giray Ya\u011fl\u0131k\u00e7\u0131 and Onur Mutlu. 2023. Ramulator 2.0: A modern modular and extensible dram simulator. IEEE Computer Architecture Letters 23 1 (2023) 112\u2013116.","DOI":"10.1109\/LCA.2023.3333759"},{"key":"e_1_3_3_1_22_2","unstructured":"Xinyin Ma Gongfan Fang and Xinchao Wang. 2023. Llm-pruner: On the structural pruning of large language models. Advances in neural information processing systems 36 (2023) 21702\u201321720."},{"key":"e_1_3_3_1_23_2","unstructured":"Xupeng Miao Gabriele Oliaro Zhihao Zhang Xinhao Cheng Zeyu Wang Rae Ying\u00a0Yee Wong Zhuoming Chen Daiyaan Arfeen Reyna Abhyankar and Zhihao Jia. 2023. Specinfer: Accelerating generative llm serving with speculative inference and token tree verification. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.09781 1 2 (2023) 4."},{"key":"e_1_3_3_1_24_2","unstructured":"NVIDIA. [n. d.]. Jetson Orin for Next-Gen Robotics | NVIDIA. https:\/\/www.nvidia.com\/en-us\/autonomous-machines\/embedded-systems\/jetson-orin\/. (Accessed on 04\/02\/2024)."},{"key":"e_1_3_3_1_25_2","unstructured":"OpenAI. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640422"},{"key":"e_1_3_3_1_27_2","unstructured":"scale snu. [n. d.]. Simulator for AttAcc. https:\/\/github.com\/scale-snu\/attacc_simulator. (Accessed on 04\/02\/2024)."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Ali Shafiee Anirban Nag Naveen Muralimanohar Rajeev Balasubramonian John\u00a0Paul Strachan Miao Hu R\u00a0Stanley Williams and Vivek Srikumar. 2016. ISAAC: A convolutional neural network accelerator with in-situ analog arithmetic in crossbars. ACM SIGARCH Computer Architecture News 44 3 (2016) 14\u201326.","DOI":"10.1145\/3007787.3001139"},{"key":"e_1_3_3_1_29_2","unstructured":"Charlie Snell Jaehoon Lee Kelvin Xu and Aviral Kumar. 2024. Scaling llm test-time compute optimally can be more effective than scaling model parameters. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.03314 (2024)."},{"key":"e_1_3_3_1_30_2","first-page":"5878","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)","author":"Sun Zhihong","year":"2024","unstructured":"Zhihong Sun, Chen Lyu, Bolun Li, Yao Wan, Hongyu Zhang, Ge Li, and Zhi Jin. 2024. Enhancing Code Generation Performance of Smaller Models by Distilling the Reasoning Ability of LLMs. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024). 5878\u20135895."},{"key":"e_1_3_3_1_31_2","unstructured":"Ziteng Sun Ananda\u00a0Theertha Suresh Jae\u00a0Hun Ro Ahmad Beirami Himanshu Jain and Felix Yu. 2023. Spectr: Fast speculative decoding via optimal transport. Advances in Neural Information Processing Systems 36 (2023) 30222\u201330242."},{"key":"e_1_3_3_1_32_2","unstructured":"Llama Team. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_1_33_2","unstructured":"Llama Team. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_34_2","unstructured":"Llama Team. 2024. The llama 3 herd of models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783 (2024)."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Yi Wang Weixuan Chen Jing Yang and Tao Li. 2018. Exploiting parallelism for CNN applications on 3D stacked processing-in-memory architecture. IEEE Transactions on Parallel and Distributed Systems 30 3 (2018) 589\u2013600.","DOI":"10.1109\/TPDS.2018.2868062"},{"key":"e_1_3_3_1_36_2","unstructured":"Guangxuan Xiao Yuandong Tian Beidi Chen Song Han and Mike Lewis. 2023. Efficient streaming language models with attention sinks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.17453 (2023)."},{"key":"e_1_3_3_1_37_2","unstructured":"Guowei Xu Peng Jin Li Hao Yibing Song Lichao Sun and Li Yuan. 2024. Llava-o1: Let vision language models reason step-by-step. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.10440 (2024)."},{"key":"e_1_3_3_1_38_2","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et\u00a0al. 2024. Qwen2. 5 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.15115 (2024)."},{"key":"e_1_3_3_1_39_2","unstructured":"Shang Yang Junxian Guo Haotian Tang Qinghao Hu Guangxuan Xiao Jiaming Tang Yujun Lin Zhijian Liu Yao Lu and Song Han. 2025. Lserve: Efficient long-sequence llm serving with unified sparse attention. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.14866 (2025)."},{"key":"e_1_3_3_1_40_2","unstructured":"Zhihang Yuan Yuzhang Shang Yang Zhou Zhen Dong Zhe Zhou Chenhao Xue Bingzhe Wu Zhikai Li Qingyi Gu Yong\u00a0Jae Lee Yan Yan Beidi Chen Guangyu Sun and Kurt Keutzer. 2024. Llm inference unveiled: Survey and roofline model insights. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.16363 (2024)."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00105"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247710"},{"key":"e_1_3_3_1_43_2","unstructured":"Zixuan Zhou Xuefei Ning Ke Hong Tianyu Fu Jiaming Xu Shiyao Li Yuming Lou Luning Wang Zhihang Yuan Xiuhong Li et\u00a0al. 2024. A survey on efficient inference for large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.14294 (2024)."}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756039","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756039","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:47:11Z","timestamp":1769464031000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756039"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":42,"alternative-id":["10.1145\/3725843.3756039","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756039","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}