{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T16:19:04Z","timestamp":1781021944732,"version":"3.54.1"},"reference-count":63,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. on Mobile Comput."],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1109\/tmc.2025.3626724","type":"journal-article","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T17:14:13Z","timestamp":1761930853000},"page":"4762-4775","source":"Crossref","is-referenced-by-count":1,"title":["Task-Oriented Feature Compression for Multimodal Understanding via Device-Edge Co-Inference"],"prefix":"10.1109","volume":"25","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5086-3066","authenticated-orcid":false,"given":"Cheng","family":"Yuan","sequence":"first","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI) of China Telecom, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6502-368X","authenticated-orcid":false,"given":"Zhening","family":"Liu","sequence":"additional","affiliation":[{"name":"Department of Electronic and Computer Engineering, Hong Kong University of Science and Technology, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4157-3108","authenticated-orcid":false,"given":"Jiashu","family":"Lv","sequence":"additional","affiliation":[{"name":"School of Software and Microelectronics, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8836-1430","authenticated-orcid":false,"given":"Jiawei","family":"Shao","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI) of China Telecom, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0523-8117","authenticated-orcid":false,"given":"Yufei","family":"Jiang","sequence":"additional","affiliation":[{"name":"School of Electronic and Information Engineering, Harbin Institute of Technology, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5222-1898","authenticated-orcid":false,"given":"Jun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Electronic and Computer Engineering, Hong Kong University of Science and Technology, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2924-946X","authenticated-orcid":false,"given":"Xuelong","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI) of China Telecom, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.807"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.738"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2025.103198"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/890"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref7","article-title":"LLaVA-OneVision: Easy visual task transfer","author":"Li","year":"2025","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref8","article-title":"Qwen2.5-VL technical report","author":"Bai","year":"2025"},{"key":"ref9","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","volume":"139","author":"Radford","year":"2021"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/MWC.002.2200468"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/mnet.2025.3541208"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2022.3223408"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.comnet.2021.107930"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/MCOM.001.2000373"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2022.3191118"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2023.3288252"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP58920.2024.10734780"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/103085.103089"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/79.952804"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-10-0072-0_35"},{"key":"ref22","article-title":"End-to-end optimized image compression","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ball\u00e9","year":"2017"},{"key":"ref23","article-title":"Variational image compression with a scale hyperprior","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ball\u00e9","year":"2018"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00796"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00563"},{"key":"ref26","first-page":"22857","article-title":"LLaVA-PruMerge: Adaptive token reduction for efficient large multimodal models","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","author":"Shang","year":"2025"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01843"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02491-7"},{"key":"ref29","article-title":"Towards semantic equivalence of tokenization in multimodal LLM","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wu","year":"2025"},{"key":"ref30","first-page":"Singapore","article-title":"Inference optimal VLMs need only one visual token but larger models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Li","year":"2025"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref33","article-title":"The information bottleneck method","author":"Tishby","year":"2000"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCWorkshops49005.2020.9145068"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2021.3126087"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2023.3314888"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/GLOBECOM54140.2023.10436784"},{"key":"ref38","article-title":"BPG image format","author":"Bellard","year":"2014"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/79.952802"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2020.3034501"},{"key":"ref41","first-page":"10771","article-title":"Joint autoregressive and hierarchical priors for learned image compression","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Minnen","year":"2018"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP40778.2020.9190935"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01453"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_27"},{"key":"ref45","article-title":"FocusLLaVA: A coarse-to-fine approach for efficient and effective visual token compression","author":"Zhu","year":"2024"},{"key":"ref46","article-title":"Multi-stage vision token dropping: Towards efficient multimodal large language model","author":"Liu","year":"2024"},{"key":"ref47","article-title":"LLaVA-Mini: Efficient image and video large multimodal models with one vision token","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhang","year":"2025"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.865"},{"key":"ref49","article-title":"Unified language-vision pretraining in LLM with dynamic discrete visual tokenization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yang","year":"2024"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2016.02.001"},{"key":"ref51","first-page":"6309","article-title":"Neural discrete representation learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Oord","year":"2017"},{"key":"ref52","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu","year":"2022"},{"issue":"120","key":"ref53","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref54","article-title":"Deepseek-v3 technical report","author":"Liu","year":"2024"},{"key":"ref55","article-title":"CompressAI: A PyTorch library and evaluation platform for end-to-end compression research","author":"B\u00e9gaint","year":"2020"},{"key":"ref56","article-title":"MME: A comprehensive evaluation benchmark for multimodal large language models","volume-title":"Proc. NeurIPS","author":"Fu","year":"2025"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0850"},{"key":"ref60","first-page":"28671","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Lu","year":"2022"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3685520"},{"key":"ref63","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Li","year":"2023"}],"container-title":["IEEE Transactions on Mobile Computing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/7755\/11423852\/11222951.pdf?arnumber=11222951","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T06:48:30Z","timestamp":1772866110000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11222951\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":63,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tmc.2025.3626724","relation":{},"ISSN":["1536-1233","1558-0660","2161-9875"],"issn-type":[{"value":"1536-1233","type":"print"},{"value":"1558-0660","type":"electronic"},{"value":"2161-9875","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4]]}}}