{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T05:50:46Z","timestamp":1776145846792,"version":"3.50.1"},"reference-count":76,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. on Image Process."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tip.2026.3680029","type":"journal-article","created":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T20:03:23Z","timestamp":1775592203000},"page":"3780-3792","source":"Crossref","is-referenced-by-count":0,"title":["SynPO: Synergizing Descriptiveness and Preference Optimization for Video Detailed Captioning"],"prefix":"10.1109","volume":"35","author":[{"given":"Jisheng","family":"Dang","sequence":"first","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, China"}]},{"given":"Yizhou","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5336-6810","authenticated-orcid":false,"given":"Hao","family":"Ye","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2331-3619","authenticated-orcid":false,"given":"Teng","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0952-476X","authenticated-orcid":false,"given":"Yulan","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Electronics and Communication Engineering, Sun Yat-sen University, Shenzhen Campus, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3514-5413","authenticated-orcid":false,"given":"Bin","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Medical Technology, Beijing Institute of Technology, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"VideoLLaMA 2: Advancing spatial\u2013temporal modeling and audio understanding in video-LLMs","author":"Cheng","year":"2024","journal-title":"arXiv:2406.07476"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"ref3","first-page":"19","article-title":"Matryoshka multimodal models","volume-title":"Proc. NeurIPS","author":"Cai"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"ref5","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. 
ICML","author":"Li"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref8","article-title":"ChatVideo: A tracklet-centric multimodal and versatile video understanding system","author":"Wang","year":"2023","journal-title":"arXiv:2304.14407"},{"key":"ref9","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume-title":"Proc. NeurIPS","author":"Rafailov"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i8.37565"},{"key":"ref11","article-title":"Temporal preference optimization for long-form video understanding","author":"Li","year":"2025","journal-title":"arXiv:2501.13919"},{"key":"ref12","article-title":"Smaug: Fixing failure modes of preference optimisation with DPO-positive","author":"Pal","year":"2024","journal-title":"arXiv:2402.13228"},{"key":"ref13","article-title":"Towards improved preference optimization pipeline: From data generation to budget-controlled regularization","author":"Chen","year":"2024","journal-title":"arXiv:2411.05875"},{"key":"ref14","article-title":"Towards analyzing and understanding the limitations of DPO: A theoretical perspective","author":"Feng","year":"2024","journal-title":"arXiv:2404.04626"},{"key":"ref15","article-title":"AuroraCap: Efficient, performant video detailed captioning and a new benchmark","author":"Chai","year":"2024","journal-title":"arXiv:2410.03051"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32168"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.765"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.775"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ASE56229.2023.00065"},{"key":"ref22","volume-title":"Llava-Next: Improved Reasoning, Ocr, and World Knowledge","author":"Liu","year":"2024"},{"key":"ref23","article-title":"Enhancing the reasoning ability of multimodal large language models via mixed preference optimization","author":"Wang","year":"2024","journal-title":"arXiv:2411.10442"},{"key":"ref24","article-title":"Mistral 7B","author":"Jiang","year":"2023","journal-title":"arXiv:2310.06825"},{"key":"ref25","volume-title":"Llama 3 Model Card","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.337"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1023\/A:1020346032608"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.61"},{"key":"ref29","article-title":"Neural machine translation by jointly learning to align and translate","author":"Bahdanau","year":"2014","journal-title":"arXiv:1409.0473"},{"key":"ref30","article-title":"Translating videos to natural language using deep recurrent neural 
networks","author":"Venugopalan","year":"2014","journal-title":"arXiv:1412.4729"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479207"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0614"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"ref36","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref37","article-title":"Yi: Open foundation models by 01.AI","author":"AI","year":"2024","journal-title":"arXiv:2403.04652"},{"key":"ref38","article-title":"Baichuan 2: Open large-scale language models","author":"Yang","year":"2023","journal-title":"arXiv:2309.10305"},{"key":"ref39","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. NeurIPS","author":"Ouyang"},{"key":"ref40","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref44","volume-title":"ChatGPT","year":"2023"},{"key":"ref45","first-page":"4299","article-title":"Deep reinforcement learning from human preferences","volume-title":"Proc. NeurIPS","volume":"30","author":"Christiano"},{"key":"ref46","article-title":"A survey of reinforcement learning from human feedback","author":"Kaufmann","year":"2023","journal-title":"arXiv:2312.14925"},{"key":"ref47","first-page":"3008","article-title":"Learning to summarize with human feedback","volume-title":"Proc. NeurIPS","volume":"33","author":"Stiennon"},{"key":"ref48","first-page":"55006","article-title":"LIMA: Less is more for alignment","volume-title":"Proc. NeurIPS","author":"Zhou"},{"key":"ref49","first-page":"10835","article-title":"Scaling laws for reward model overoptimization","volume-title":"Proc. 
ICML","author":"Gao"},{"key":"ref50","article-title":"Training a helpful and harmless assistant with reinforcement learning from human feedback","author":"Bai","year":"2022","journal-title":"arXiv:2204.05862"},{"key":"ref51","article-title":"DeepSeekMath: Pushing the limits of mathematical reasoning in open language models","author":"Shao","year":"2024","journal-title":"arXiv:2402.03300"},{"key":"ref52","article-title":"DeepSeek-r1: Incentivizing reasoning capability in LLMs via reinforcement learning","author":"Guo","year":"2025","journal-title":"arXiv:2501.12948"},{"key":"ref53","article-title":"Mitigating object hallucinations in large vision-language models through visual contrastive decoding","author":"Leng","year":"2023","journal-title":"arXiv:2311.16922"},{"key":"ref54","article-title":"EventHallusion: Diagnosing event hallucinations in video LLMs","author":"Zhang","year":"2024","journal-title":"arXiv:2409.16597"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32166"},{"key":"ref56","article-title":"Self-refine: Iterative refinement with self-feedback","author":"Madaan","year":"2023","journal-title":"arXiv:2303.17651"},{"key":"ref57","article-title":"Understanding LLM scientific reasoning through promptings and model\u2019s explanation on the answers","author":"Rueda","year":"2025","journal-title":"arXiv:2505.01482"},{"key":"ref58","article-title":"Enhancing mathematical reasoning in large language models with self-consistency-based hallucination detection","author":"Liu","year":"2025","journal-title":"arXiv:2504.09440"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.insights-1.4"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3946"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01265"},{"key":"ref63","article-title":"A general theoretical paradigm to understand learning from human preferences","author":"Gheshlaghi Azar","year":"2023","journal-title":"arXiv:2310.12036"},{"key":"ref64","article-title":"KTO: Model alignment as prospect theoretic optimization","author":"Ethayarajh","year":"2024","journal-title":"arXiv:2402.01306"},{"key":"ref65","article-title":"Contrastive preference optimization: Pushing the boundaries of LLM performance in machine translation","author":"Xu","year":"2024","journal-title":"arXiv:2401.08417"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32474"},{"key":"ref67","article-title":"Video-SALMONN 2: Caption-enhanced audio-visual large language models","author":"Tang","year":"2025","journal-title":"arXiv:2506.15220"},{"key":"ref68","first-page":"23453","article-title":"Bringing RNNs back to efficient open-ended video understanding","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","author":"Xu"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref70","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. ACL Workshop Intrinsic Extrinsic Eval. Measures Mach. Transl. 
Summarization","author":"Banerjee"},{"key":"ref71","article-title":"AlpacaEval: An automatic evaluator of instruction-following models","author":"Li","year":"2023"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2020"},{"key":"ref73","article-title":"Zephyr: Direct distillation of LM alignment","author":"Tunstall","year":"2023","journal-title":"arXiv:2310.16944"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.183"},{"key":"ref75","first-page":"9722","article-title":"UltraFeedback: Boosting language models with high-quality feedback","volume-title":"Proc. ICML","author":"Cui"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.792"}],"container-title":["IEEE Transactions on Image Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/83\/11355710\/11476822.pdf?arnumber=11476822","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T05:05:28Z","timestamp":1776143128000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11476822\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":76,"URL":"https:\/\/doi.org\/10.1109\/tip.2026.3680029","relation":{},"ISSN":["1057-7149","1941-0042"],"issn-type":[{"value":"1057-7149","type":"print"},{"value":"1941-0042","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}