{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:14:00Z","timestamp":1780060440851,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2112562"],"award-info":[{"award-number":["2112562"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809203","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"261-278","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["TimelyLLM: Time-sensitive LLM Serving System for Physical-I\/O Limited Agents"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2072-1502","authenticated-orcid":false,"given":"Neiwen","family":"Ling","sequence":"first","affiliation":[{"name":"Yale University, New Haven, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6450-4983","authenticated-orcid":false,"given":"Guojun","family":"Chen","sequence":"additional","affiliation":[{"name":"Yale University, New Haven, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2199-6391","authenticated-orcid":false,"given":"Anurag","family":"Khandelwal","sequence":"additional","affiliation":[{"name":"Yale University, New Haven, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0840-167X","authenticated-orcid":false,"given":"Lin","family":"Zhong","sequence":"additional","affiliation":[{"name":"Yale University, New Haven, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2023. Piper: a fast local neural text-to-speech system. https:\/\/github.com\/rhasspy\/piper."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. Qwen2 technical report. (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. vLLM: easy fast and cheap LLM serving for everyone."},{"key":"e_1_3_2_1_4_1","volume-title":"Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, et al. 2024. Phi-3 technical report: a highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"Proc. ICML.","author":"Abhyankar Reyna","year":"2025","unstructured":"Reyna Abhyankar, Zijian He, Vikranth Srivatsa, Hao Zhang, and Yiying Zhang. 2025. InferCept: efficient intercept support for augmented large language model inference. In Proc. ICML."},{"key":"e_1_3_2_1_6_1","volume-title":"Proc. USENIX OSDI.","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming throughput-latency tradeoff in LLM inference with Sarathi-Serve. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_7_1","volume-title":"On the landscape of spoken language models: a comprehensive survey. arXiv preprint arXiv:2504.08528","author":"Arora Siddhant","year":"2025","unstructured":"Siddhant Arora, Kai-Wei Chang, Chung-Ming Chien, Yifan Peng, Haibin Wu, Yossi Adi, Emmanuel Dupoux, Hung-Yi Lee, Karen Livescu, and Shinji Watanabe. 2025. On the landscape of spoken language models: a comprehensive survey. arXiv preprint arXiv:2504.08528 (2025)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proc. IEEE Conf. Computer Communications Workshops (INFOCOM WKSHPS).","author":"Barci\u015b Michal","year":"2019","unstructured":"Michal Barci\u015b and Hermann Hellwagner. 2019. An evaluation model for information distribution in multi-robot systems. In Proc. IEEE Conf. Computer Communications Workshops (INFOCOM WKSHPS)."},{"key":"e_1_3_2_1_9_1","volume-title":"Language models are few-shot learners. arXiv preprint arXiv:2005.14165","author":"Brown Tom B","year":"2020","unstructured":"Tom B Brown. 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_10_1","volume-title":"Proc. IEEE\/RSJ IROS.","author":"Carrio Adrian","year":"2018","unstructured":"Adrian Carrio, Sai Vemprala, Andres Ripoll, Srikanth Saripalli, and Pascual Campoy. 2018. Drone detection using depth maps. In Proc. IEEE\/RSJ IROS."},{"key":"e_1_3_2_1_11_1","volume-title":"TypeFly: flying drones with large language model. arXiv preprint arXiv:2312.14950v2","author":"Chen Guojun","year":"2024","unstructured":"Guojun Chen, Xiaojing Yu, Neiwen Ling, and Lin Zhong. 2024. TypeFly: flying drones with large language model. arXiv preprint arXiv:2312.14950v2 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Chatfly: low-latency drone planning with large language models","author":"Chen Guojun","year":"2025","unstructured":"Guojun Chen, Xiaojing Yu, Neiwen Ling, and Lin Zhong. 2025. Chatfly: low-latency drone planning with large language models. IEEE Transactions on Mobile Computing (2025)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF00383389"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Ning Ding Yulin Chen Bokai Xu Yujia Qin Zhi Zheng Shengding Hu Zhiyuan Liu Maosong Sun and Bowen Zhou. 2023. Enhancing chat language models by scaling high-quality instructional conversations. arXiv:2305.14233 [cs.CL]","DOI":"10.18653\/v1\/2023.emnlp-main.183"},{"key":"e_1_3_2_1_15_1","unstructured":"Hugging Face. 2024. Transformers: state-of-the-art machine learning for PyTorch TensorFlow and JAX."},{"key":"e_1_3_2_1_16_1","volume-title":"Efficient LLM scheduling by learning to rank. arXiv preprint arXiv:2408.15792","author":"Fu Yichao","year":"2024","unstructured":"Yichao Fu, Siqi Zhu, Runlong Su, Aurick Qiao, Ion Stoica, and Hao Zhang. 2024. Efficient LLM scheduling by learning to rank. arXiv preprint arXiv:2408.15792 (2024)."},{"key":"e_1_3_2_1_17_1","first-page":"325","article-title":"Prompt cache: modular attention reuse for low-latency inference","volume":"6","author":"Gim In","year":"2024","unstructured":"In Gim, Guojun Chen, Seung-seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2024. Prompt cache: modular attention reuse for low-latency inference. Proceedings of Machine Learning and Systems 6 (2024), 325\u2013338.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"Asynchronous LLM function calling. arXiv preprint arXiv:2412.07017","author":"Gim In","year":"2024","unstructured":"In Gim, Seung-seob Lee, and Lin Zhong. 2024. Asynchronous LLM function calling. arXiv preprint arXiv:2412.07017 (2024)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731569.3764814"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30570"},{"key":"e_1_3_2_1_21_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. GPT-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-92611-2_4"},{"key":"e_1_3_2_1_23_1","volume-title":"Proc. IEEE\/ACM IPSN.","author":"Jiang Zhehao","year":"2023","unstructured":"Zhehao Jiang, Neiwen Ling, Xuan Huang, Shuyao Shi, Chenhao Wu, Xiaoguang Zhao, Zhenyu Yan, and Guoliang Xing. 2023. CoEdge: a cooperative edge system for distributed real-time deep learning tasks. In Proc. IEEE\/ACM IPSN."},{"key":"e_1_3_2_1_24_1","volume-title":"Proc. NeurIPS.","author":"Jin Yunho","year":"2023","unstructured":"Yunho Jin, Chun-Feng Wu, David Brooks, and Gu-Yeon Wei. 2023. S3: increasing GPU utilization during generative inference for higher throughput. In Proc. NeurIPS."},{"key":"e_1_3_2_1_25_1","volume-title":"An LLM compiler for parallel function calling. arXiv preprint arXiv:2312.04511","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Suhong Moon, Ryan Tabrizi, Nicholas Lee, Michael W Mahoney, Kurt Keutzer, and Amir Gholami. 2023. An LLM compiler for parallel function calling. arXiv preprint arXiv:2312.04511 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_27_1","volume-title":"Hojun Choi, Steven Y Ko, Sangeun Oh, and Insik Shin.","author":"Lee Sunjae","year":"2024","unstructured":"Sunjae Lee, Junyoung Choi, Jungjae Lee, Munim Hasan Wasi, Hojun Choi, Steven Y Ko, Sangeun Oh, and Insik Shin. 2024. MobileGPT: augmenting LLM with human-like app memory for mobile task automation. (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Andes: defining and enhancing quality-of-experience in LLM-based text streaming services. arXiv preprint arXiv:2404.16283","author":"Liu Jiachen","year":"2024","unstructured":"Jiachen Liu, Jae-Won Chung, Zhiyu Wu, Fan Lai, Myungjin Lee, and Mosharaf Chowdhury. 2024. Andes: defining and enhancing quality-of-experience in LLM-based text streaming services. arXiv preprint arXiv:2404.16283 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Reflect: summarizing robot experiences for failure explanation and correction. arXiv preprint arXiv:2306.15724","author":"Liu Zeyi","year":"2023","unstructured":"Zeyi Liu, Arpit Bahety, and Shuran Song. 2023. Reflect: summarizing robot experiences for failure explanation and correction. arXiv preprint arXiv:2306.15724 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"CMU","author":"Maynard David P","year":"1988","unstructured":"David P Maynard, Samuel E Shipman, Raymond K Clark, J Duane Northcutt, Russell B Kegley, Betsy A Zimmerman, and Peter J Keleher. 1988. An example realtime command, control, and battle management application for alpha. Archons Project TR-88121, CMU (1988)."},{"key":"e_1_3_2_1_31_1","unstructured":"Meta. 2024. The Llama 3 herd of models. https:\/\/ai.meta.com\/research\/publications\/the-llama-3-herd-of-models\/."},{"key":"e_1_3_2_1_32_1","unstructured":"Meta. 2024. Meta AI assistant built with LLaMA 3. https:\/\/about.fb.com\/news\/2024\/04\/meta-ai-assistant-built-with-llama-3\/."},{"key":"e_1_3_2_1_33_1","unstructured":"Neawhen. 2026. TimelyLLM: time-sensitive LLM serving system for physical-I\/O limited agents. https:\/\/github.com\/Neawhen\/TimelyLLM. Accessed: 2026-04-11."},{"key":"e_1_3_2_1_34_1","unstructured":"Neuromeka. 2024. Neuromeka Indy."},{"key":"e_1_3_2_1_35_1","unstructured":"NVIDIA. 2024. TensorRT-LLM. https:\/\/nvidia.github.io\/TensorRT-LLM\/overview.html."},{"key":"e_1_3_2_1_36_1","volume-title":"Proc. ACM ASPLOS.","author":"Oh Hyungjun","year":"2024","unstructured":"Hyungjun Oh, Kihong Kim, Jaemin Kim, Sungkyun Kim, Junyeol Lee, Du-seong Chang, and Jiwon Seo. 2024. ExeGPT: constraint-aware resource scheduling for LLM inference. In Proc. ACM ASPLOS."},{"key":"e_1_3_2_1_37_1","volume-title":"One queue is all you need: resolving head-of-line blocking in large language model serving. arXiv preprint arXiv:2407.00047","author":"Patke Archit","year":"2024","unstructured":"Archit Patke, Dhemath Reddy, Saurabh Jha, Haoran Qiu, Christian Pinto, Shengkun Cui, Chandra Narayanaswami, Zbigniew Kalbarczyk, and Ravishankar Iyer. 2024. One queue is all you need: resolving head-of-line blocking in large language model serving. arXiv preprint arXiv:2407.00047 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Managing delays in human-robot interaction. ACM Transactions on Computer-Human Interaction","author":"Pelikan Hannah","year":"2023","unstructured":"Hannah Pelikan and Emily Hofstetter. 2023. Managing delays in human-robot interaction. ACM Transactions on Computer-Human Interaction (2023)."},{"key":"e_1_3_2_1_39_1","first-page":"606","article-title":"Efficiently scaling transformer inference","volume":"5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems 5 (2023), 606\u2013624.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_40_1","volume-title":"Efficient interactive LLM serving with proxy model-based sequence length prediction. arXiv preprint arXiv:2404.08509","author":"Qiu Haoran","year":"2024","unstructured":"Haoran Qiu, Weichao Mao, Archit Patke, Shengkun Cui, Saurabh Jha, Chen Wang, Hubertus Franke, Zbigniew T Kalbarczyk, Tamer Ba\u015far, and Ravishankar K Iyer. 2024. Efficient interactive LLM serving with proxy model-based sequence length prediction. arXiv preprint arXiv:2404.08509 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"International conference on machine learning. PMLR, 28492\u201328518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492\u201328518."},{"key":"e_1_3_2_1_42_1","volume-title":"Sayplan: grounding large language models using 3D scene graphs for scalable task planning. arXiv preprint arXiv:2307.06135","author":"Rana Krishan","year":"2023","unstructured":"Krishan Rana, Jesse Haviland, Sourav Garg, Jad Abou-Chakra, Ian Reid, and Niko Suenderhauf. 2023. Sayplan: grounding large language models using 3D scene graphs for scalable task planning. arXiv preprint arXiv:2307.06135 (2023)."},{"key":"e_1_3_2_1_43_1","unstructured":"Allen Z Ren Anushri Dixit Alexandra Bodrova Sumeet Singh Stephen Tu Noah Brown Peng Xu Leila Takayama Fei Xia Jake Varley et al. 2023. Robots that ask for help: uncertainty alignment for large language model planners. arXiv preprint arXiv:2307.01928 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Proc. EAI Int. Conf. Performance Evaluation Methodologies and Tools.","author":"Seakhoa-King Shireen","year":"2019","unstructured":"Shireen Seakhoa-King, Paul Balaji, Nicolas Trama Alvarez, and William J Knottenbelt. 2019. Revenue-driven scheduling in drone delivery networks with time-sensitive service level agreements. In Proc. EAI Int. Conf. Performance Evaluation Methodologies and Tools."},{"key":"e_1_3_2_1_45_1","volume-title":"Don't Stop Me Now: embedding based scheduling for LLMs. arXiv preprint arXiv:2410.01035","author":"Shahout Rana","year":"2024","unstructured":"Rana Shahout, Eran Malach, Chunwei Liu, Weifan Jiang, Minlan Yu, and Michael Mitzenmacher. 2024. Don't Stop Me Now: embedding based scheduling for LLMs. arXiv preprint arXiv:2410.01035 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Proc. NeurIPS.","author":"Shen Yongliang","year":"2024","unstructured":"Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, and Yueting Zhuang. 2024. HuggingGPT: solving AI tasks with ChatGPT and its friends in Hugging Face. In Proc. NeurIPS."},{"key":"e_1_3_2_1_47_1","volume-title":"Response time and display rate in human performance with computers. ACM Computing Surveys (CSUR)","author":"Shneiderman Ben","year":"1984","unstructured":"Ben Shneiderman. 1984. Response time and display rate in human performance with computers. ACM Computing Surveys (CSUR) (1984)."},{"key":"e_1_3_2_1_48_1","volume-title":"Real-time anomaly detection and reactive planning with large language models. arXiv preprint arXiv:2407.08735","author":"Sinha Rohan","year":"2024","unstructured":"Rohan Sinha, Amine Elhafsi, Christopher Agia, Matthew Foutter, Edward Schmerling, and Marco Pavone. 2024. Real-time anomaly detection and reactive planning with large language models. arXiv preprint arXiv:2407.08735 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"Kourosh Darvish, Al\u00e1n Aspuru-Guzik, Florian Shkurti, and Animesh Garg.","author":"Skreta Marta","year":"2023","unstructured":"Marta Skreta, Naruki Yoshikawa, Sebastian Arellano-Rubach, Zhi Ji, Lasse Bj\u00f8rn Kristensen, Kourosh Darvish, Al\u00e1n Aspuru-Guzik, Florian Shkurti, and Animesh Garg. 2023. Errors are useful prompts: instruction guided task programming with verifier-assisted iterative prompting. arXiv preprint arXiv:2303.14100 (2023)."},{"key":"e_1_3_2_1_50_1","unstructured":"Tello. 2023. Tello SDK user guide. https:\/\/dlcdn.ryzerobotics.com\/downloads\/Tello\/Tello%20SDK%202.0%20User%20Guide.pdf"},{"key":"e_1_3_2_1_51_1","unstructured":"Unitree. 2024. Unitree Go2."},{"key":"e_1_3_2_1_52_1","volume-title":"Song-Chun Zhu, and Hangxin Liu.","author":"Wang Shu","year":"2024","unstructured":"Shu Wang, Muzhi Han, Ziyuan Jiao, Zeyu Zhang, Ying Nian Wu, Song-Chun Zhu, and Hangxin Liu. 2024. LLM3: large language model-based task and motion planning with motion failure reasoning. arXiv preprint arXiv:2403.11552 (2024)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737413"},{"key":"e_1_3_2_1_54_1","volume-title":"Proc. ACM MobiCom.","author":"Wen Hao","year":"2024","unstructured":"Hao Wen, Yuanchun Li, Guohong Liu, Shanhui Zhao, Tao Yu, Toby Jia-Jun Li, Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu. 2024. AutoDroid: LLM-powered task automation in Android. In Proc. ACM MobiCom."},{"key":"e_1_3_2_1_55_1","unstructured":"Wikipedia contributors. 2024. Time-utility function \u2014 Wikipedia. https:\/\/en.wikipedia.org\/wiki\/Time-utility_function"},{"key":"e_1_3_2_1_56_1","volume-title":"Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Gang Huang, Xuanzhe Liu, and Xin Jin. 2023. Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920 (2023)."},{"key":"e_1_3_2_1_57_1","unstructured":"Haibin Wu Yuxuan Hu Ruchao Fan Xiaofei Wang Kenichi Kumatani Bo Ren Jianwei Yu Heng Lu Lijuan Wang Yao Qian et al. 2025. Towards efficient speech-text jointly decoding within one speech language model. arXiv preprint arXiv:2506.04518 (2025)."},{"key":"e_1_3_2_1_58_1","unstructured":"Shunyu Yao Jeffrey Zhao Dian Yu Nan Du Izhak Shafran Karthik R Narasimhan and Yuan Cao. 2022. ReAct: synergizing reasoning and acting in language models. In The eleventh international conference on learning representations."},{"key":"e_1_3_2_1_59_1","volume-title":"Proc. USENIX OSDI.","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: a distributed serving system for transformer-based generative models. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_60_1","volume-title":"Proc. USENIX NSDI.","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: serving DNNs in the wild. In Proc. USENIX NSDI."},{"key":"e_1_3_2_1_61_1","volume-title":"Proc. IEEE ICRA.","author":"Zhang Jiatao","year":"2024","unstructured":"Jiatao Zhang, Lanling Tang, Yufan Song, Qiwei Meng, Haofu Qian, Jun Shao, Wei Song, Shiqiang Zhu, and Jason Gu. 2024. FLTRNN: faithful long-horizon task planning for robotics with large language models. In Proc. IEEE ICRA."},{"key":"e_1_3_2_1_62_1","volume-title":"Bootstrap your own skills: learning to solve new tasks with large language model guidance. arXiv preprint arXiv:2310.10021","author":"Zhang Jesse","year":"2023","unstructured":"Jesse Zhang, Jiahui Zhang, Karl Pertsch, Ziyi Liu, Xiang Ren, Minsuk Chang, Shao-Hua Sun, and Joseph J Lim. 2023. Bootstrap your own skills: learning to solve new tasks with large language model guidance. arXiv preprint arXiv:2310.10021 (2023)."},{"key":"e_1_3_2_1_63_1","volume-title":"Proc. NeurIPS.","author":"Zheng Zangwei","year":"2024","unstructured":"Zangwei Zheng, Xiaozhe Ren, Fuzhao Xue, Yang Luo, Xin Jiang, and Yang You. 2024. Response length perception and sequence scheduling: an LLM-empowered LLM inference pipeline. In Proc. NeurIPS."}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:55:10Z","timestamp":1780059310000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809203"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":63,"alternative-id":["10.1145\/3745756.3809203","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809203","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}