{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T03:08:33Z","timestamp":1762312113644,"version":"build-2065373602"},"reference-count":72,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1007\/s11432-024-4550-8","type":"journal-article","created":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T01:29:50Z","timestamp":1762306190000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Uncertainty-aware large language model response length perception"],"prefix":"10.1007","volume":"68","author":[{"given":"Bin","family":"Shi","sequence":"first","affiliation":[]},{"given":"Bo","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Qinghua","family":"Zheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,3]]},"reference":[{"key":"4550_CR1","unstructured":"Zhao W X, Zhou K, Li J, et al. A survey of large language models. ArXiv:2303.18223"},{"key":"4550_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3641289","volume":"15","author":"Y Chang","year":"2024","unstructured":"Chang Y, Wang X, Wang J, et al. A survey on evaluation of large language models. ACM Trans Intell Syst Tech, 2024, 15: 1\u201345","journal-title":"ACM Trans Intell Syst Tech"},{"key":"4550_CR3","unstructured":"Lyu C, Xu J, Wang L. New trends in machine translation using large language models: case examples with ChatGPT. ArXiv:2305.01181"},{"key":"4550_CR4","first-page":"21841","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"L Zhong","year":"2024","unstructured":"Zhong L, Wang Z. Can LLM replace stack overflow? A study on robustness and reliability of large language model code generation. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2024. 21841\u201321849"},{"key":"4550_CR5","unstructured":"Li N, Gao C, Li Y, et al. Large language model-empowered agents for simulating macroeconomic activities. ArXiv:2310.10436"},{"key":"4550_CR6","unstructured":"Orenstrakh M S, Karnalim O, Suarez C A, et al. Detecting LLM-generated text in computing education: a comparative study for ChatGPT cases. ArXiv:2307.07411"},{"key":"4550_CR7","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s10916-023-01925-4","volume":"47","author":"M Cascella","year":"2023","unstructured":"Cascella M, Montomoli J, Bellini V, et al. Evaluating the feasibility of ChatGPT in healthcare: an analysis of multiple clinical and research scenarios. J Med Syst, 2023, 47: 33","journal-title":"J Med Syst"},{"key":"4550_CR8","unstructured":"Fei Z, Shen X, Zhu D, et al. Lawbench: benchmarking legal knowledge of large language models. ArXiv:2309.16289"},{"key":"4550_CR9","first-page":"1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"R Y Aminabadi","year":"2022","unstructured":"Aminabadi R Y, Rajbhandari S, Awan A A, et al. Deepspeed-inference: enabling efficient inference of transformer models at unprecedented scale. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, 2022. 1\u201315"},{"key":"4550_CR10","first-page":"613","volume-title":"Proceedings of the 14th USENIX Symposium on Networked Systems Design and Implementation","author":"D Crankshaw","year":"2017","unstructured":"Crankshaw D, Wang X, Zhou G, et al. Clipper: a low-Latency online prediction serving system. In: Proceedings of the 14th USENIX Symposium on Networked Systems Design and Implementation, 2017. 613\u2013627"},{"key":"4550_CR11","unstructured":"Qiu H, Mao W, Patke A, et al. Efficient interactive LLM serving with proxy model-based sequence length prediction. ArXiv:2404.08509"},{"key":"4550_CR12","unstructured":"Wu B, Zhong Y, Zhang Z, et al. Fast distributed inference serving for large language models. ArXiv:2305.05920"},{"key":"4550_CR13","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1145\/3132747.3132780","volume-title":"Proceedings of the 26th Symposium on Operating Systems Principles","author":"G Prekas","year":"2017","unstructured":"Prekas G, Kogias M, Bugnion E. Zygos: achieving low tail latency for microsecond-scale networked tasks. In: Proceedings of the 26th Symposium on Operating Systems Principles, 2017. 325\u2013341"},{"key":"4550_CR14","doi-asserted-by":"publisher","first-page":"103184","DOI":"10.1016\/j.artint.2019.103184","volume":"278","author":"P Gurevich","year":"2020","unstructured":"Gurevich P, Stuke H. Gradient conjugate priors and multi-layer neural networks. Artif Intell, 2020, 278: 103184","journal-title":"Artif Intell"},{"key":"4550_CR15","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford A, Wu J, Child R, et al. Language models are unsupervised multitask learners. OpenAI blog, 2019, 1: 9","journal-title":"OpenAI blog"},{"key":"4550_CR16","first-page":"65517","volume":"36","author":"Z Zheng","year":"2024","unstructured":"Zheng Z, Ren X, Xue F, et al. Response length perception and sequence scheduling: an LLM-empowered LLM inference pipeline. Adv Neural Inf Process Syst, 2024, 36: 65517\u201365530","journal-title":"Adv Neural Inf Process Syst"},{"key":"4550_CR17","unstructured":"Olston C, Fiedel N, Gorovoy K, et al. TensorFlow-serving: flexible, high-performance ML serving. ArXiv:1712.06139"},{"key":"4550_CR18","volume-title":"Stanford Alpaca: an instruction-following LLAMA model","author":"R Taori","year":"2023","unstructured":"Taori R, Gulrajani I, Zhang T, et al. Stanford Alpaca: an instruction-following LLAMA model. 2023. https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"4550_CR19","first-page":"770","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"K He","year":"2016","unstructured":"He K, Zhang X, Ren S, et al. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016. 770\u2013778"},{"key":"4550_CR20","first-page":"24261","volume":"34","author":"I O Tolstikhin","year":"2021","unstructured":"Tolstikhin I O, Houlsby N, Kolesnikov A, et al. MLP-mixer: an all-MLP architecture for vision. Adv Neural Inf Process Syst, 2021, 34: 24261\u201324272","journal-title":"Adv Neural Inf Process Syst"},{"key":"4550_CR21","first-page":"443","volume-title":"Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation","author":"A Gujarati","year":"2020","unstructured":"Gujarati A, Karimi R, Alzayat S, et al. Serving DNNs like clockwork: performance predictability from the bottom up. In: Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation, 2020. 443\u2013462"},{"key":"4550_CR22","unstructured":"Wei J, Tay Y, Bommasani R, et al. Emergent abilities of large language models. ArXiv:2206.07682"},{"key":"4550_CR23","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei J, Wang X, Schuurmans D, et al. Chain-of-thought prompting elicits reasoning in large language models. Adv Neural Inf Process Syst, 2022, 35: 24824\u201324837","journal-title":"Adv Neural Inf Process Syst"},{"key":"4550_CR24","unstructured":"Nye M, Andreassen A J, Gur-Ari G, et al. Show your work: scratchpads for intermediate computation with language models. ArXiv:2112.00114"},{"key":"4550_CR25","unstructured":"Hu E J, Shen Y, Wallis P, et al. Lora: low-rank adaptation of large language models. ArXiv:2106.09685"},{"key":"4550_CR26","unstructured":"Fu Z, Lam W, Yu Q, et al. Decoder-only or encoder-decoder? Interpreting language model as a regularized encoder-decoder. ArXiv:2304.04052"},{"key":"4550_CR27","unstructured":"Zhang X, Li Z, Zhang Y, et al. Language models are universal embedders. ArXiv:2310.08232"},{"key":"4550_CR28","doi-asserted-by":"publisher","first-page":"1841","DOI":"10.1145\/3447548.3467325","volume-title":"Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery & Data Mining","author":"D Wu","year":"2021","unstructured":"Wu D, Gao L, Chinazzi M, et al. Quantifying uncertainty in deep spatiotemporal forecasting. In: Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery & Data Mining, 2021. 1841\u20131851"},{"key":"4550_CR29","doi-asserted-by":"publisher","first-page":"143","DOI":"10.1257\/jep.15.4.143","volume":"15","author":"R Koenker","year":"2001","unstructured":"Koenker R, Hallock K F. Quantile regression. J Econ Perspect, 2001, 15: 143\u2013156","journal-title":"J Econ Perspect"},{"key":"4550_CR30","first-page":"1050","volume-title":"Proceedings of the 33rd International Conference on Machine Learning","author":"Y Gal","year":"2016","unstructured":"Gal Y, Ghahramani Z. Dropout as a Bayesian approximation: representing model uncertainty in deep learning. In: Proceedings of the 33rd International Conference on Machine Learning, 2016. 1050\u20131059"},{"key":"4550_CR31","volume-title":"Advances in Neural Information Processing Systems","author":"M Sensoy","year":"2018","unstructured":"Sensoy M, Kaplan L, Kandemir M. Evidential deep learning to quantify classification uncertainty. In: Advances in Neural Information Processing Systems, 2018"},{"key":"4550_CR32","first-page":"14927","volume":"33","author":"A Amini","year":"2020","unstructured":"Amini A, Schwarting W, Soleimany A, et al. Deep evidential regression. Adv Neural Inf Process Syst, 2020, 33: 14927\u201314937","journal-title":"Adv Neural Inf Process Syst"},{"key":"4550_CR33","doi-asserted-by":"publisher","first-page":"110","DOI":"10.1063\/1.2811677","volume":"41","author":"G Parisi","year":"1988","unstructured":"Parisi G, Shankar R. Statistical field theory. Phys Today, 1988, 41: 110","journal-title":"Phys Today"},{"key":"4550_CR34","volume-title":"Instruction in the wild: a user-based instruction dataset","author":"F Xue","year":"2023","unstructured":"Xue F, Jain K, Shah M H, et al. Instruction in the wild: a user-based instruction dataset. 2023. https:\/\/github.com\/XueFuzhao\/InstructionWild"},{"key":"4550_CR35","unstructured":"Touvron H, Martin L, Stone K, et al. LLAMA 2: open foundation and fine-tuned chat models. ArXiv:2307.09288"},{"key":"4550_CR36","unstructured":"Yang A, Yang B, Zhang B, et al. Qwen2.5 technical report. ArXiv:2412.15115"},{"key":"4550_CR37","first-page":"7895","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"D Oh","year":"2022","unstructured":"Oh D, Shin B. Improving evidential deep learning via multi-task learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2022. 7895\u20137903"},{"key":"4550_CR38","doi-asserted-by":"publisher","first-page":"953","DOI":"10.1111\/jtsa.12426","volume":"39","author":"R Askanazi","year":"2018","unstructured":"Askanazi R, Diebold F X, Schorfheide F, et al. On the comparison of interval forecasts. J Time Series Anal, 2018, 39: 953\u2013965","journal-title":"J Time Series Anal"},{"key":"4550_CR39","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1198\/016214506000001437","volume":"102","author":"T Gneiting","year":"2007","unstructured":"Gneiting T, Raftery A E. Strictly proper scoring rules, prediction, and estimation. J Amer Statist Assoc, 2007, 102: 359\u2013378","journal-title":"J Amer Statist Assoc"},{"key":"4550_CR40","first-page":"1321","volume-title":"Proceedings of the 34th International Conference on Machine Learning","author":"C Guo","year":"2017","unstructured":"Guo C, Pleiss G, Sun Y, et al. On calibration of modern neural networks. In: Proceedings of the 34th International Conference on Machine Learning, 2017. 1321\u20131330"},{"key":"4550_CR41","first-page":"2796","volume-title":"Proceedings of the 35th International Conference on Machine Learning","author":"V Kuleshov","year":"2018","unstructured":"Kuleshov V, Fenner N, Ermon S. Accurate uncertainties for deep learning using calibrated regression. In: Proceedings of the 35th International Conference on Machine Learning, 2018. 2796\u20132804"},{"key":"4550_CR42","volume-title":"Proceedings of the 5th Berkeley Symposium on Mathematical Statistics and Probability","author":"J Macqueen","year":"1967","unstructured":"Macqueen J. Some methods for classification and analysis of multivariate observations. In: Proceedings of the 5th Berkeley Symposium on Mathematical Statistics and Probability, 1967"},{"key":"4550_CR43","first-page":"6889","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Z Ding","year":"2021","unstructured":"Ding Z, Han X, Liu P, et al. Local temperature scaling for probability calibration. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021. 6889\u20136899"},{"key":"4550_CR44","unstructured":"Hoffmann J, Borgeaud S, Mensch A, et al. Training compute-optimal large language models. ArXiv:2203.15556"},{"key":"4550_CR45","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery A, Narang S, Devlin J, et al. PALM: scaling language modeling with pathways. J Mach Learn Res, 2023, 24: 1\u2013113","journal-title":"J Mach Learn Res"},{"key":"4550_CR46","unstructured":"Gholami A, Azad A, Keutzer K, et al. Integrated model and data parallelism in training neural networks. ArXiv:1712.04432"},{"key":"4550_CR47","doi-asserted-by":"publisher","first-page":"766","DOI":"10.1145\/3605573.3605613","volume-title":"Proceedings of the 52nd International Conference on Parallel Processing","author":"S Li","year":"2023","unstructured":"Li S, Liu H, Bian Z, et al. Colossal-AI: a unified deep learning system for large-scale parallel training. In: Proceedings of the 52nd International Conference on Parallel Processing, 2023. 766\u2013775"},{"key":"4550_CR48","first-page":"103","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"Y Huang","year":"2019","unstructured":"Huang Y, Cheng Y, Bapna A, et al. GPipe: efficient training of giant neural networks using pipeline parallelism. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems, 2019. 103\u2013112"},{"key":"4550_CR49","unstructured":"Frantar E, Ashkboos S, Hoefler T, et al. GptQ: accurate post-training compression for generative pretrained transformers. ArXiv:2210.17323"},{"key":"4550_CR50","unstructured":"Park G, Park B, Kim M, et al. Lut-gemm: quantized matrix multiplication based on LUTs for efficient inference in large-scale generative language models. ArXiv:2206.09557"},{"key":"4550_CR51","unstructured":"Lin J, Tang J, Tang H, et al. AWQ: activation-aware weight quantization for LLM compression and acceleration. ArXiv:2306.00978"},{"key":"4550_CR52","unstructured":"Dettmers T, Lewis M, Belkada Y, et al. LLM. int8 (): 8-bit matrix multiplication for transformers at scale. ArXiv:2208.07339"},{"key":"4550_CR53","first-page":"38087","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"G Xiao","year":"2023","unstructured":"Xiao G, Lin J, Seznec M, et al. Smoothquant: accurate and efficient post-training quantization for large language models. In: Proceedings of the 40th International Conference on Machine Learning, 2023. 38087\u201338099"},{"key":"4550_CR54","unstructured":"Ribar L, Chelombiev I, Hudlass-Galley L, et al. SparQ attention: bandwidth-efficient LLM inference. ArXiv:2312.04985"},{"key":"4550_CR55","unstructured":"Kang H, Zhang Q, Kundu S, et al. Gear: an efficient KV cache compression recipe for near-lossless generative inference of LLM. ArXiv:2403.05527"},{"key":"4550_CR56","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1145\/3600006.3613165","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"W Kwon","year":"2023","unstructured":"Kwon W, Li Z, Zhuang S, et al. Efficient memory management for large language model serving with paged attention. In: Proceedings of the 29th Symposium on Operating Systems Principles, 2023. 611\u2013626"},{"key":"4550_CR57","first-page":"521","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation","author":"G I Yu","year":"2022","unstructured":"Yu G I, Jeong J S, Kim G W, et al. Orca: a distributed serving system for Transformer-based generative models. In: Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation, 2022. 521\u2013538"},{"key":"4550_CR58","unstructured":"Holmes C, Tanaka M, Wyatt M, et al. DeepSpeed-Fastgen: high-throughput text generation for LLMs via MII and DeepSpeed-inference. ArXiv:2401.08671"},{"key":"4550_CR59","unstructured":"Cheng Z, Kasai J, Yu T. Batch prompting: efficient inference with large language model APIs. ArXiv:2301.08721"},{"key":"4550_CR60","doi-asserted-by":"publisher","first-page":"711","DOI":"10.1038\/s41551-022-00988-x","volume":"7","author":"M Chua","year":"2023","unstructured":"Chua M, Kim D, Choi J, et al. Tackling prediction uncertainty in machine learning for healthcare. Nat Biomed Eng, 2023, 7: 711\u2013718","journal-title":"Nat Biomed Eng"},{"key":"4550_CR61","unstructured":"Michelmore R, Kwiatkowska M, Gal Y. Evaluating uncertainty quantification in end-to-end autonomous driving control. ArXiv:1811.06817"},{"key":"4550_CR62","first-page":"1613","volume-title":"Proceedings of the 32nd International Conference on Machine Learning","author":"C Blundell","year":"2015","unstructured":"Blundell C, Cornebise J, Kavukcuoglu K, et al. Weight uncertainty in neural network. In: Proceedings of the 32nd International Conference on Machine Learning, 2015. 1613\u20131622"},{"key":"4550_CR63","first-page":"4950","volume-title":"Proceedings of the 37th International Conference on Machine Learning","author":"T Joo","year":"2020","unstructured":"Joo T, Chung U, Seo M G. Being Bayesian about categorical probability. In: Proceedings of the 37th International Conference on Machine Learning, 2020. 4950\u20134961"},{"key":"4550_CR64","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"A Malinin","year":"2018","unstructured":"Malinin A, Gales M. Predictive uncertainty estimation via prior networks. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, 2018"},{"key":"4550_CR65","doi-asserted-by":"publisher","first-page":"21","DOI":"10.18653\/v1\/2022.acl-tutorials.4","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Tutorial Abstracts","author":"J Gu","year":"2022","unstructured":"Gu J, Tan X. Non-autoregressive sequence generation. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Tutorial Abstracts, 2022. 21\u201327"},{"key":"4550_CR66","first-page":"3016","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"Z Sun","year":"2019","unstructured":"Sun Z, Li Z, Wang H, et al. Fast structured decoding for sequence models. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems, 2019. 3016\u20133026"},{"key":"4550_CR67","unstructured":"Gu J, Bradbury J, Xiong C, et al. Non-autoregressive neural machine translation. ArXiv:1711.02281"},{"key":"4550_CR68","first-page":"263","volume":"19","author":"P F Brown","year":"1993","unstructured":"Brown P F, Della Pietra S A, Della Pietra V J, et al. The mathematics of statistical machine translation: parameter estimation. Comput Linguist, 1993, 19: 263\u2013311","journal-title":"Comput Linguist"},{"key":"4550_CR69","unstructured":"Devlin J. Bert: Pre-training of deep bidirectional transformers for language understanding. ArXiv:1810.04805"},{"key":"4550_CR70","unstructured":"Ghazvininejad M, Levy O, Liu Y, et al. Mask-predict: parallel decoding of conditional masked language models. ArXiv:1904.09324"},{"key":"4550_CR71","unstructured":"Lee J, Mansimov E, Cho K. Deterministic non-autoregressive neural sequence modeling by iterative refinement. ArXiv:1802.06901"},{"key":"4550_CR72","unstructured":"Lee J, Shu R, Cho K. Iterative refinement in the continuous space for non-autoregressive neural machine translation. ArXiv:2009.07177"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4550-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-024-4550-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4550-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T03:03:32Z","timestamp":1762311812000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-024-4550-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11]]},"references-count":72,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,11]]}},"alternative-id":["4550"],"URL":"https:\/\/doi.org\/10.1007\/s11432-024-4550-8","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"type":"print","value":"1674-733X"},{"type":"electronic","value":"1869-1919"}],"subject":[],"published":{"date-parts":[[2025,11]]},"assertion":[{"value":"16 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 February 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 August 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 November 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"210101"}}