{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T04:06:23Z","timestamp":1779422783976,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T00:00:00Z","timestamp":1779753600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,26]]},"DOI":"10.1145\/3786335.3813134","type":"proceedings-article","created":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T03:16:22Z","timestamp":1779419782000},"page":"890-916","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Robust Batch-Level Query Routing for Large Language Models under Cost and Capacity Constraints"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0562-0386","authenticated-orcid":false,"given":"Jelena","family":"Markovic-Voronov","sequence":"first","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3482-0421","authenticated-orcid":false,"given":"Kayhan","family":"Behdin","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9840-7883","authenticated-orcid":false,"given":"Yuanda","family":"Xu","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5695-4386","authenticated-orcid":false,"given":"Zhengze","family":"Zhou","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5703-5560","authenticated-orcid":false,"given":"Zhipeng","family":"Wang","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1384-9743","authenticated-orcid":false,"given":"Rahul","family":"Mazumder","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA and Massachusetts Institute of Technology, Cambridge, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,5,26]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3617232.3624849"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Aharon Ben-Tal Arkadi Nemirovski and Laurent El\u00a0Ghaoui. 2009. Robust optimization. (2009).","DOI":"10.1515\/9781400831050"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Tianqi Chen. 2016. XGBoost: A Scalable Tree Boosting System. Cornell University (2016).","DOI":"10.1145\/2939672.2939785"},{"key":"e_1_3_3_1_6_2","unstructured":"Xingyu Chen Jiahao Xu Tian Liang Zhiwei He Jianhui Pang Dian Yu Linfeng Song Qiuzhi Liu Mengfei Zhou Zhuosheng Zhang et\u00a0al. 2024. Do not think that much for 2+ 3=? on the overthinking of o1-like llms. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.21187 (2024)."},{"key":"e_1_3_3_1_7_2","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_3_1_8_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Ding Dujian","unstructured":"Dujian Ding, Ankur Mallick, Chi Wang, Robert Sim, Subhabrata Mukherjee, Victor R\u00fchle, Laks\u00a0VS Lakshmanan, and Ahmed\u00a0Hassan Awadallah. [n. d.]. Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_1_9_2","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et\u00a0al. 2024. The llama 3 herd of models. arXiv e-prints (2024) arXiv\u20132407."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-4380-9_41"},{"key":"e_1_3_3_1_11_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Feng Tao","year":"2025","unstructured":"Tao Feng, Yanzhen Shen, and Jiaxuan You. 2025. GraphRouter: A Graph-based Router for LLM Selections. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=eU39PDsZtT"},{"key":"e_1_3_3_1_12_2","first-page":"1041","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Gunasekaran Jashwant\u00a0Raj","year":"2022","unstructured":"Jashwant\u00a0Raj Gunasekaran, Cyan\u00a0Subhra Mishra, Prashanth Thinakaran, Bikash Sharma, Mahmut\u00a0Taylan Kandemir, and Chita\u00a0R Das. 2022. Cocktail: A multidimensional optimization for model serving in cloud. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 1041\u20131057."},{"key":"e_1_3_3_1_13_2","unstructured":"Tom Gunter Zirui Wang Chong Wang Ruoming Pang Andy Narayanan Aonan Zhang Bowen Zhang Chen Chen Chung-Cheng Chiu David Qiu et\u00a0al. 2024. Apple intelligence foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21075 (2024)."},{"key":"e_1_3_3_1_14_2","volume-title":"The SCIP Optimization Suite 10.0","author":"Hojny Christopher","year":"2025","unstructured":"Christopher Hojny, Mathieu Besan\u00e7on, Ksenia Bestuzheva, Sander Borst, Antonia Chmiela, Jo\u00e3o Dion\u00edsio, Leon Eifler, Mohammed Ghannam, Ambros Gleixner, Adrian G\u00f6\u00df, Alexander Hoen, Rolf van\u00a0der Hulst, Dominik Kamp, Thorsten Koch, Kevin Kofler, Jurgen Lentz, Stephen\u00a0J. Maher, Gioni Mexi, Erik M\u00fchmer, Marc\u00a0E. Pfetsch, Sebastian Pokutta, Felipe Serrano, Yuji Shinano, Mark Turner, Stefan Vigerske, Matthias Walter, Dieter Weninger, and Liding Xu. 2025. The SCIP Optimization Suite 10.0. Technical Report. Optimization Online. https:\/\/optimization-online.org\/2025\/11\/the-scip-optimization-suite-10-0\/"},{"key":"e_1_3_3_1_15_2","volume-title":"Agentic Markets Workshop at ICML 2024","author":"Hu Qitian\u00a0Jason","year":"2024","unstructured":"Qitian\u00a0Jason Hu, Jacob Bieker, Xiuyu Li, Nan Jiang, Benjamin Keigwin, Gaurav Ranganath, Kurt Keutzer, and Shriyash\u00a0Kaustubh Upadhyay. 2024. RouterBench: A Benchmark for Multi-LLM Routing System. In Agentic Markets Workshop at ICML 2024. https:\/\/openreview.net\/forum?id=IVXmV8Uxwh"},{"key":"e_1_3_3_1_16_2","unstructured":"Wittawat Jitkrittum Harikrishna Narasimhan Ankit\u00a0Singh Rawat Jeevesh Juneja Zifeng Wang Chen-Yu Lee Pradeep Shenoy Rina Panigrahy Aditya\u00a0Krishna Menon and Sanjiv Kumar. 2025. Universal model routing for efficient llm inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.08773 (2025)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Daniel Kuhn Soroosh Shafiee and Wolfram Wiesemann. 2025. Distributionally robust optimization. Acta Numerica 34 (2025) 579\u2013804.","DOI":"10.1017\/S0962492924000084"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_1_20_2","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et\u00a0al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.19437 (2024)."},{"key":"e_1_3_3_1_21_2","unstructured":"Yifan Lu Rixin Liu Jiayi Yuan Xingqi Cui Shenrun Zhang Hongyi Liu and Jiarong Xing. 2025. RouterArena: An Open Platform for Comprehensive Comparison of LLM Routers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.00202 (2025)."},{"key":"e_1_3_3_1_22_2","unstructured":"Martian Learning. 2025. Routing for AI Agents. https:\/\/withmartian.com\/solutions\/routing-for-ai-agents. Accessed: 2026-01-28."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Kai Mei Wujiang Xu Minghao Guo Shuhang Lin and Yongfeng Zhang. 2025. Omnirouter: Budget and performance controllable multi-llm routing. ACM SIGKDD Explorations Newsletter 27 2 (2025) 107\u2013116.","DOI":"10.1145\/3787470.3787480"},{"key":"e_1_3_3_1_24_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Ong Isaac","year":"2025","unstructured":"Isaac Ong, Amjad Almahairi, Vincent Wu, Wei-Lin Chiang, Tianhao Wu, Joseph\u00a0E. Gonzalez, M\u00a0Waleed Kadous, and Ion Stoica. 2025. RouteLLM: Learning to Route LLMs from Preference Data. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=8sSqNntaMr"},{"key":"e_1_3_3_1_25_2","unstructured":"OpenAI. 2025. GPT-5 System Card. https:\/\/openai.com\/index\/gpt-5-system-card\/. Accessed: 2026-01-28."},{"key":"e_1_3_3_1_26_2","unstructured":"Guanzhong Pan Vishal Chodnekar Abinas Roy and Haibo Wang. 2025. A Cost-Benefit Analysis of On-Premise Large Language Model Deployment: Breaking Even with Commercial LLM Services. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.18101 (2025)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"R\u00a0Tyrrell Rockafellar Stanislav Uryasev et\u00a0al. 2000. Optimization of conditional value-at-risk. Journal of risk 2 (2000) 21\u201342.","DOI":"10.21314\/JOR.2000.038"},{"key":"e_1_3_3_1_29_2","first-page":"606","volume-title":"Proceedings of the 17th ACM International Conference on Web Search and Data Mining","author":"\u0160akota Marija","year":"2024","unstructured":"Marija \u0160akota, Maxime Peyrard, and Robert West. 2024. Fly-swat or cannon? cost-effective language model choice via meta-modeling. In Proceedings of the 17th ACM International Conference on Web Search and Data Mining. 606\u2013615."},{"key":"e_1_3_3_1_30_2","unstructured":"Seamus Somerstep Felipe\u00a0Maia Polo Allysson Flavio\u00a0Melo de Oliveira Prattyush Mangal M\u00edrian Silva Onkar Bhardwaj Mikhail Yurochkin and Subha Maity. 2025. Carrot: A cost aware rate optimal router. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.03261 (2025)."},{"key":"e_1_3_3_1_31_2","unstructured":"Wei Song Zhenya Huang Cheng Cheng Weibo Gao Bihan Xu GuanHao Zhao Fei Wang and Runze Wu. 2025. IRT-Router: Effective and Interpretable Multi-LLM Routing via Item Response Theory. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.01048 (2025)."},{"key":"e_1_3_3_1_32_2","unstructured":"vLLM Semantic Router\u00a0Team. 2025. vLLM Semantic Router. https:\/\/github.com\/vllm-project\/semantic-router."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.5555\/1062391"},{"key":"e_1_3_3_1_34_2","volume-title":"Integer and combinatorial optimization","author":"Wolsey Laurence\u00a0A","year":"1999","unstructured":"Laurence\u00a0A Wolsey and George\u00a0L Nemhauser. 1999. Integer and combinatorial optimization. John Wiley & Sons."},{"key":"e_1_3_3_1_35_2","volume-title":"The Thirty-ninth Annual Conference on Neural Information Processing Systems","author":"Wu Fangzhou","unstructured":"Fangzhou Wu and Sandeep Silwal. [n. d.]. Efficient Training-Free Online Routing for High-Volume Multi-LLM Serving. In The Thirty-ninth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Qi Xin. 2025. Hybrid Cloud Architecture for Efficient and Cost-Effective Large Language Model Deployment. Journal of Information Systems and Informatics 7 3 (2025) 2182\u20132195.","DOI":"10.51519\/journalisi.v7i3.1170"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICC45041.2023.10278962"},{"key":"e_1_3_3_1_38_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Zhuang Richard","unstructured":"Richard Zhuang, Tianhao Wu, Zhaojin Wen, Andrew Li, Jiantao Jiao, and Kannan Ramchandran. [n. d.]. EmbedLLM: Learning Compact Representations of Large Language Models. In The Thirteenth International Conference on Learning Representations."}],"event":{"name":"CAIS '26: ACM Conference on AI and Agentic Systems","location":"San Jose CA USA","acronym":"CAIS '26"},"container-title":["Proceedings of the ACM Conference on AI and Agentic Systems"],"original-title":[],"deposited":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T03:18:59Z","timestamp":1779419939000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3786335.3813134"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,26]]},"references-count":37,"alternative-id":["10.1145\/3786335.3813134","10.1145\/3786335"],"URL":"https:\/\/doi.org\/10.1145\/3786335.3813134","relation":{},"subject":[],"published":{"date-parts":[[2026,5,26]]},"assertion":[{"value":"2026-05-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}