{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T20:57:34Z","timestamp":1775509054886,"version":"3.50.1"},"reference-count":239,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Singapore Ministry of Education Tertiary Education Research Fund","award":["MOE2024-TRF-015"],"award-info":[{"award-number":["MOE2024-TRF-015"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Netw. Sci. Eng."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tnse.2026.3668404","type":"journal-article","created":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T20:57:00Z","timestamp":1772485020000},"page":"7667-7683","source":"Crossref","is-referenced-by-count":0,"title":["Constraint-Driven Evolution of Multimodal Video Intelligence: A Network and System Perspective"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3262-3049","authenticated-orcid":false,"given":"Xuzhao","family":"Li","sequence":"first","affiliation":[{"name":"Nanyang Technological University, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2565-8857","authenticated-orcid":false,"given":"Xuchen","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiyu","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Physical and Mathematical Sciences, Nanyang Technological University, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhaorui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computing, The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4475-5451","authenticated-orcid":false,"given":"Kang Hao","family":"Cheong","sequence":"additional","affiliation":[{"name":"School of Physical and Mathematical Sciences, Nanyang Technological University, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Towards AI-assisted sustainable adaptive video streaming systems: Tutorial and survey","author":"Farahani","year":"2024"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.3934\/mbe.2023686"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3614419.3644023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3074319"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3444693"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02059"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/tgrs.2025.3526725"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-021-02166-7"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108282"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3389024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3607827.3616847"},{"key":"ref12","first-page":"19850","article-title":"ATCTrack: Aligning target-context cues with dynamic target states for robust vision-language tracking","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","author":"Feng","year":"2025"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0476"},{"key":"ref14","article-title":"Online and real-time tracking in a surveillance scenario","author":"Urbann","year":"2021"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3657282"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.iot.2023.100690"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01727"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-024-10489-4"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3712059"},{"key":"ref21","first-page":"13109","article-title":"Video-of-thought: Step-by-step video reasoning from perception to cognition","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","volume":"235","author":"Fei","year":"2024"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2939"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1096"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72670-5_5"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3566695"},{"key":"ref26","article-title":"Foundation models for video understanding: A survey","author":"Madan","year":"2024"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.217"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.432"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00945"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3557053"},{"key":"ref31","article-title":"Underwater camouflaged object tracking meets vision-language SAM2","author":"Zhang","year":"2024"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3531375"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28205"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733473"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888064"},{"key":"ref36","article-title":"ATSTrack: Enhancing visual-language tracking by aligning temporal and spatial scales","author":"Zhen","year":"2025"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72691-0_27"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-025-07472-8"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2023.02.023"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02217"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/j.mlwa.2024.100588"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2025.3553290"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2024.3492263"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICME57554.2024.10687451"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680657"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2025.103604"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2025.110787"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611803"},{"key":"ref49","article-title":"ReasoningTrack: Chain-of-thought reasoning for long-term vision-language tracking","author":"Wang","year":"2025"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1241"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00724"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01785"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00160"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/183"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00816"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3557570"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2025.113731"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/RICAI60863.2023.10489325"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00915"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2024.3520103"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3510735"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3726529"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3285441"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2024.3393230"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-96-4558-9"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610544"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00540"},{"key":"ref68","first-page":"25007","article-title":"A multi-modal global instance tracking benchmark (mgit): Better locating target in complex spatio-temporal and causal relationship","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Hu","year":"2023"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00552"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01387-y"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01288"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3301933"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01817"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01805"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888064"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32223"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.777"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00552"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1587"},{"key":"ref81","article-title":"Visual language tracking with multi-modal interaction: A robust benchmark","author":"Li","year":"2024"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw63382.2024.00724"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4157"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01318"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00900"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00707"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3058626"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.339"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00782"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00675"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3399933"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25412"},{"key":"ref97","first-page":"5651","article-title":"End-to-end dense video captioning as sequence generation","volume-title":"Proc. 29th Int. Conf. Comput. Linguistics","author":"Zhu","year":"2022"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2022.103204"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00487"},{"key":"ref100","article-title":"SAVCHOI: Detecting suspicious activities using dense video captioning with human object interactions","author":"Mittal","year":"2022"},{"key":"ref101","article-title":"Semantic-aware pretraining for dense video captioning","author":"Wang","year":"2022"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2021.11.017"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/s40747-023-00998-5"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.156"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2024.104385"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6881"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00751"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.aacl-main.48"},{"key":"ref109","article-title":"Team RUC_AIM3 technical report at activitynet 2020 task 2: Exploring sequential events detection for dense video captioning","author":"Song","year":"2020"},{"key":"ref110","article-title":"Zero-shot dense video captioning by jointly optimizing text and moment","author":"Jo","year":"2023"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01284"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01769"},{"key":"ref114","article-title":"PLLaVA : Parameter-free LLaVA extension from images to videos for video dense captioning","author":"Xu","year":"2024"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73414-4_26"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2024.04.052"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.inlg-1.42"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3283067"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/VCIP53242.2021.9675405"},{"key":"ref121","first-page":"122154","article-title":"AutoTimes: Autoregressive time series forecasters via large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"37","author":"Liu","year":"2024"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00300"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20160"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651359"},{"key":"ref126","first-page":"545","article-title":"Accelerating the training of large language models using efficient activation rematerialization and optimal hybrid parallelism","volume-title":"Proc. 2024 USENIX Annu. Tech. Conf.","author":"Yuan","year":"2024"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i38.40448"},{"key":"ref128","article-title":"Large language models for planning: A comprehensive and systematic survey","author":"Cao","year":"2025"},{"key":"ref129","article-title":"Prefill-decode aggregation or disaggregation? Unifying both for goodput-optimized LLM serving","author":"Wang","year":"2025"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/584"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2019.00272"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.01011"},{"key":"ref133","first-page":"23634","article-title":"MERLOT: Multimodal neural script knowledge models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zellers","year":"2021"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00498"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01265"},{"key":"ref136","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","volume-title":"Proc. 49th Annu. Meeting Assoc. Comput. Linguistics: Hum. Lang. Technol.","author":"Chen","year":"2011"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref143","article-title":"Tarsier: Recipes for training and evaluating large video description models","author":"Wang","year":"2024"},{"key":"ref144","article-title":"Auroracap: Efficient, performant video detailed captioning and a new benchmark","author":"Chai","year":"2024"},{"key":"ref145","article-title":"Fiova: A multi-annotator benchmark for human-aligned video captioning","author":"Hu","year":"2024"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref147","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. Acl Workshop Intrinsic Extrinsic Eval. Measures Mach. Transl. And\/Or Summarization","author":"Banerjee","year":"2005"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2015.2409733"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1017\/S0305000900009168"},{"key":"ref152","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","volume-title":"Proc. Annu. Meeting Assoc. Comput. Linguistics","author":"Chen","year":"2011"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref155","first-page":"7590","article-title":"Towards automatic learning of procedures from web instructional videos","volume-title":"Proc. AAAI Conf. Artif. Intell.","author":"Zhou","year":"2018"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2019.00272"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"ref158","article-title":"VideoRFT: Incentivizing video reasoning capability in MLLMs via reinforced fine-tuning","author":"Wang","year":"2025"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72995-9_11"},{"key":"ref160","article-title":"Embrace-3 K: Embodied reasoning and action in complex environments","author":"Lin","year":"2025"},{"key":"ref161","article-title":"Reinforcement learning tuning for videoLLMs: Reward design and data efficiency","author":"Li","year":"2025"},{"key":"ref162","article-title":"ThinkAct: Vision-language-action reasoning via reinforced visual latent planning","author":"Huang","year":"2025"},{"key":"ref163","article-title":"SiLVR: A simple language-based video reasoning framework","author":"Zhang","year":"2025"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i8.37582"},{"key":"ref165","article-title":"TinyLLaVA-Video-R1: Towards smaller LMMs for video reasoning","author":"Zhang","year":"2025"},{"key":"ref166","article-title":"Scaling RL to long videos","author":"Chen","year":"2025"},{"key":"ref167","article-title":"Visionary-R1: Mitigating shortcuts in visual reasoning with reinforcement learning","author":"Xia","year":"2025"},{"key":"ref168","article-title":"Echoink-R1: Exploring audio-visual reasoning in multimodal LLMs via reinforcement learning","author":"Xing","year":"2025"},{"key":"ref169","article-title":"Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities","author":"Comanici","year":"2025"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1145\/3690624.3709168"},{"key":"ref171","article-title":"Momentor: Advancing video large language model with fine-grained temporal reasoning","author":"Qian","year":"2024"},{"key":"ref172","article-title":"Reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing","author":"Wu","year":"2025"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-emnlp.394"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-emnlp.1001"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i9.37655"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00311"},{"key":"ref177","article-title":"Spacer: Reinforcing MLLMs in video spatial reasoning","author":"Ouyang","year":"2025"},{"key":"ref178","article-title":"Video-R1: Reinforcing video reasoning in MLLMs","author":"Feng","year":"2025"},{"key":"ref179","article-title":"VideoChat-R1: Enhancing spatio-temporal perception via reinforcement fine-tuning","author":"Li","year":"2025"},{"key":"ref180","article-title":"Thinking with videos: Multimodal tool-augmented reinforcement learning for long video reasoning","author":"Zhang","year":"2025"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i12.37937"},{"key":"ref182","article-title":"Scene-R1: Video-grounded large language models for 3D scene reasoning without 3D annotations","author":"Yuan","year":"2025"},{"key":"ref183","article-title":"Omni-R1: Reinforcement learning for omnimodal reasoning via two-system collaboration","author":"Zhong","year":"2025"},{"key":"ref184","article-title":"Reinforcing video reasoning with focused thinking","author":"Dang","year":"2025"},{"key":"ref185","article-title":"DeepVideo-R1: Video reinforcement fine-tuning via difficulty-aware regressive grpo","author":"Park","year":"2025"},{"key":"ref186","article-title":"SophiaVL-R1: Reinforcing MLLMs reasoning with thinking reward","author":"Fan","year":"2025"},{"key":"ref187","article-title":"Pixel reasoner: Incentivizing pixel-space reasoning with curiosity-driven reinforcement learning","author":"Su","year":"2025"},{"key":"ref188","article-title":"Look less, reason more: Rollout-guided adaptive pixel-space reasoning","author":"Li","year":"2025"},{"key":"ref189","article-title":"Select less, reason more: Prioritizing evidence purity for video reasoning","author":"Li","year":"2025"},{"key":"ref190","article-title":"VersaVid-R1: A versatile video understanding and reasoning model from question answering to captioning tasks","author":"Chen","year":"2025"},{"key":"ref191","article-title":"M2-Reasoning: Empowering MLLMs with unified general and spatial reasoning","author":"AI","year":"2025"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.3390\/smartcities5010019"},{"key":"ref193","article-title":"Cognitive architectures for language agents","author":"Sumers","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2022.102784"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2021.3104254"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01388"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3165153"},{"key":"ref198","article-title":"Videowebarena: Evaluating long context multimodal agents with video understanding web tasks","author":"Jang","year":"2024"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1145\/3748302"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.12709\/mest.13.13.02.06"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.121502"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.150"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2758"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.517"},{"key":"ref206","article-title":"Video-mmmu: Evaluating knowledge acquisition from multi-discipline professional videos","author":"Hu","year":"2025"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00793"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02245"},{"key":"ref209","article-title":"VCR-Bench: A comprehensive evaluation framework for video chain-of-thought reasoning","author":"Qi","year":"2025"},{"key":"ref210","article-title":"Video-holmes: Can MLLM think like holmes for complex video reasoning?","author":"Cheng","year":"2025"},{"key":"ref211","article-title":"MMR-V: What\u2019s left unsaid? A benchmark for multimodal deep reasoning in videos","author":"Zhu","year":"2025"},{"key":"ref212","first-page":"28828","article-title":"LongVideoBench: A benchmark for long-context interleaved video-language understanding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"37","author":"Wu","year":"2024"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3205207"},{"key":"ref214","article-title":"MME-reasoning: A comprehensive benchmark for logical reasoning in MLLMs","author":"Yuan","year":"2025"},{"key":"ref215","article-title":"STAR: A benchmark for situated reasoning in real-world videos","author":"Wu","year":"2024"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01099"},{"key":"ref217","article-title":"VideoMathQA: Benchmarking mathematical reasoning via multimodal understanding in videos","author":"Rasheed","year":"2025"},{"key":"ref218","article-title":"VideoVista: A versatile benchmark for video understanding and reasoning","author":"Li","year":"2024"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00975"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01271"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02438"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1016\/j.robot.2021.103911"},{"key":"ref223","article-title":"Knowledge graph-based system for technical document retrieval: A deductive reasoning-focused exploration","author":"Sesbou","year":"2024"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1609\/hcomp.v10i1.21997"},{"key":"ref225","article-title":"Cosmos-reason1: From physical common sense to embodied reasoning","author":"Azzolini","year":"2025"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-023-02036-y"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1145\/3544549.3585786"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.3390\/mti9010006"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1145\/3587259.3627561"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3195643"},{"key":"ref231","doi-asserted-by":"crossref","first-page":"1103","DOI":"10.1007\/s00778-021-00726-w","article-title":"Data distribution debugging in machine learning pipelines","volume":"31","author":"Grafberger","year":"2022","journal-title":"VLDB J."},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.63282\/3050-9246.ijetcsit-v6i2p107"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1109\/TBC.2022.3176193"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2024.103811"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02718"},{"key":"ref236","article-title":"Modeling key narrative elements for story understanding and generation","author":"Brahman","year":"2022"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109906"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/LICS52264.2021.9470608"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.15"}],"container-title":["IEEE Transactions on Network Science and Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6488902\/11264281\/11417444.pdf?arnumber=11417444","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T19:58:15Z","timestamp":1775505495000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11417444\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":239,"URL":"https:\/\/doi.org\/10.1109\/tnse.2026.3668404","relation":{},"ISSN":["2327-4697","2334-329X"],"issn-type":[{"value":"2327-4697","type":"electronic"},{"value":"2334-329X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}