{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:13:53Z","timestamp":1765008833166,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3771066","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:16Z","timestamp":1765008376000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Time-IC: Empowering MLLM with Interleaved Context for Temporal-Sensitive Video Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0970-021X","authenticated-orcid":false,"given":"Henghao","family":"Zhao","sequence":"first","affiliation":[{"name":"Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4102-6506","authenticated-orcid":false,"given":"Peng","family":"Huang","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0694-9458","authenticated-orcid":false,"given":"Rui","family":"Yan","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5341-5985","authenticated-orcid":false,"given":"Zechao","family":"Li","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"Proceedings of the European Conference on Computer Vision\u00a0(ECCV)","author":"Ataallah Kirolos","year":"2024","unstructured":"Kirolos Ataallah, 
Xiaoqian Shen, Eslam Abdelrahman, Essam Sleiman, Mingchen Zhuge, Jian Ding, Deyao Zhu, J\u00fcrgen Schmidhuber, and Mohamed Elhoseiny. 2024. Goldfish: Vision-Language Understanding of Arbitrarily Long Videos. In Proceedings of the European Conference on Computer Vision\u00a0(ECCV)."},{"key":"e_1_3_3_1_3_2","volume-title":"arXiv","author":"Chen Houlun","year":"2023","unstructured":"Houlun Chen, Xin Wang, Hong Chen, Zihan Song, Jia Jia, and Wenwu Zhu. 2023. Grounding-Prompter: Prompting LLM with Multimodal Information for Temporal Sentence Grounding in Long Videos. In arXiv."},{"key":"e_1_3_3_1_4_2","volume-title":"arXiv","author":"Cheng Zesen","year":"2024","unstructured":"Zesen Cheng, Sicong Leng, Hang Zhang, Yifei Xin, Xin Li, Guanzheng Chen, Yongxin Zhu, Wenqi Zhang, Ziyang Luo, Deli Zhao, and Lidong Bing. 2024. VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs. In arXiv."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00030"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_3_1_7_2","volume-title":"AAAI Conference on Artificial Intelligence\u00a0(AAAI)","author":"Guo Yongxin","year":"2019","unstructured":"Yongxin Guo, Jingyu Liu, Mingda Li, Dingxin Chen, Xiaoying Tang, Dianbo Sui, Qingbin Liu, Xi Chen, and Kevin Zhao. 2019. VTG-LLM: Integrating Timestamp Knowledge into Video LLMs for Enhanced Video Temporal Grounding. In AAAI Conference on Artificial Intelligence\u00a0(AAAI)."},{"key":"e_1_3_3_1_8_2","volume-title":"Proceedings of the International Conference on Learning Representations\u00a0(ICLR)","author":"Guo Yongxin","year":"2025","unstructured":"Yongxin Guo, Jingyu Liu, Mingda Li, Xiaoying Tang, Qingbin Liu, and Xi Chen. 2025. TRACE: Temporal Grounding Video LLM via Causal Event Modeling. 
In Proceedings of the International Conference on Learning Representations\u00a0(ICLR)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01255"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01720"},{"key":"e_1_3_3_1_11_2","volume-title":"International Conference on Learning Representations\u00a0(ICLR)","author":"Hu Edward\u00a0J","year":"2022","unstructured":"Edward\u00a0J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations\u00a0(ICLR)."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"e_1_3_3_1_13_2","volume-title":"Proceedings of the European Conference on Computer Vision\u00a0(ECCV)","author":"Huang De-An","year":"2024","unstructured":"De-An Huang, Shijia Liao, Subhashree Radhakrishnan, Hongxu Yin, Pavlo Molchanov, Zhiding Yu, and Jan Kautz. 2024. LITA: Language Instructed Temporal-Localization Assistant. In Proceedings of the European Conference on Computer Vision\u00a0(ECCV)."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Peng Huang Xiangbo Shu Rui Yan Zhewei Tu and Jinhui Tang. 2025. Appearance-Agnostic Representation Learning for Compositional Action Recognition. IEEE Transactions on Circuits and Systems for Video Technology 35 4 (2025) 3039\u20133053.","DOI":"10.1109\/TCSVT.2024.3384392"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.5244\/C.34.29"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01273"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Xun Jiang Xing Xu Jingran Zhang Fumin Shen Zuo Cao and Heng\u00a0Tao Shen. 2024. SDN: Semantic Decoupling Network for Temporal Language Grounding. 
IEEE Transactions on Neural Networks and Learning Systems 35 5 (2024) 6598\u20136612.","DOI":"10.1109\/TNNLS.2022.3211850"},{"key":"e_1_3_3_1_18_2","volume-title":"arXiv","author":"Kahatapitiya Kumara","year":"2024","unstructured":"Kumara Kahatapitiya, Kanchana Ranasinghe, Jongwoo Park, and Michael\u00a0S. Ryoo. 2024. Language Repository for Long Video Understanding. In arXiv."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01318"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_3_1_21_2","volume-title":"Advances in Neural Information Processing Systems\u00a0(NeurIPS)","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Tamara\u00a0L Berg, and Mohit Bansal. 2021. Detecting moments and highlights in videos via natural language queries. In Advances in Neural Information Processing Systems\u00a0(NeurIPS)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"e_1_3_3_1_23_2","volume-title":"International Conference on Machine Learning\u00a0(ICML)","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In International Conference on Machine Learning\u00a0(ICML). 13\u00a0pages."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_3_1_25_2","volume-title":"European Conference on Computer Vision\u00a0(ECCV)","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024. LLaMA-VID: An Image is Worth 2 Tokens in Large Language Models. 
In European Conference on Computer Vision\u00a0(ECCV)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"e_1_3_3_1_28_2","volume-title":"Advances in Neural Information Processing Systems\u00a0(NeurIPS)","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning. In Advances in Neural Information Processing Systems\u00a0(NeurIPS)."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_3_1_33_2","volume-title":"Proceedings of the International Conference on Machine Learning\u00a0(ICML)","author":"Qian Long","year":"2024","unstructured":"Long Qian, Juncheng Li, Yu Wu, Yaobo Ye, Hao Fei, Tat-Seng Chua, Yueting Zhuang, and Siliang Tang. 2024. Momentor: advancing video large language model with fine-grained temporal reasoning. In Proceedings of the International Conference on Machine Learning\u00a0(ICML)."},{"key":"e_1_3_3_1_34_2","volume-title":"International Conference on Machine Learning\u00a0(ICML)","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. 
In International Conference on Machine Learning\u00a0(ICML)."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_3_1_37_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition\u00a0(CVPR)","author":"Song Yale","year":"2015","unstructured":"Yale Song, Jordi Vallmitjana, Amanda Stent, and Alejandro Jaimes. 2015. Tvsum: Summarizing web videos using titles. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition\u00a0(CVPR)."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_51"},{"key":"e_1_3_3_1_39_2","volume-title":"arXiv","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. In arXiv."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00751"},{"key":"e_1_3_3_1_41_2","volume-title":"arXiv","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. GIT: A Generative Image-to-text Transformer for Vision and Language. In arXiv."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"e_1_3_3_1_44_2","volume-title":"Proceedings of the European Conference on Computer Vision\u00a0(ECCV)","author":"Wang Xiaohan","year":"2024","unstructured":"Xiaohan Wang, Yuhui Zhang, and Serena Yeung-Levy. 2024. VideoAgent: Long-form Video Understanding with Large Language Model as Agent. 
In Proceedings of the European Conference on Computer Vision\u00a0(ECCV)."},{"key":"e_1_3_3_1_45_2","volume-title":"arXiv","author":"Xu Mingze","year":"2025","unstructured":"Mingze Xu, Mingfei Gao, Shiyu Li, Jiasen Lu, Zhe Gan, Zhengfeng Lai, Meng Cao, Kai Kang, Yinfei Yang, and Afshin Dehghan. 2025. SlowFast-LLaVA-1.5: A Family of Token-Efficient Video Large Language Models for Long-Form Video Understanding. In arXiv."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_3_1_47_2","volume-title":"arXiv","author":"Yang An","year":"2024","unstructured":"An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, Guanting Dong, Haoran Wei, Huan Lin, Jialong Tang, Jialin Wang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Ma, Jianxin Yang, Jin Xu, Jingren Zhou, Jinze Bai, Kai Dang, Keming Lu, Keqin Chen, Kexin Yang, Mei Li, Mingfeng Xue, Na Ni, Pei Zhang, Peng Wang, Ru Peng, Rui Men, Ruize Gao, Runji Lin, Shijie Wang, Shuai Bai, Sinan Tan, Tianhang Zhu, Tianhao Li, Tianyu Liu, Wenbin Ge, Xiaodong Deng, Xiaohuan Zhou, Xingzhang Ren, Xipin Wei, Xuancheng Ren, Xuejing Liu, Yang Fan, Yang Yao, Yichang Zhang, Yu Wan, Yunfei Chu, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, Zhifang Guo, and Zhihao Fan. 2024. Qwen2 Technical Report. In arXiv."},{"key":"e_1_3_3_1_48_2","volume-title":"Advances in Neural Information Processing Systems\u00a0(NeurIPS)","author":"Yu Shoubin","year":"2023","unstructured":"Shoubin Yu, Jaemin Cho, Prateek Yadav, and Mohit Bansal. 2023. Self-chained image-language model for video localization and question answering. 
In Advances in Neural Information Processing Systems\u00a0(NeurIPS)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.559"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"crossref","unstructured":"Songyang Zhang Houwen Peng Jianlong Fu Yijuan Lu and Jiebo Luo. 2021. Multi-scale 2d temporal adjacency networks for moment localization with natural language. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 12 (2021) 9073\u20139087.","DOI":"10.1109\/TPAMI.2021.3120745"},{"key":"e_1_3_3_1_52_2","volume-title":"arXiv","author":"Zhao Henghao","year":"2025","unstructured":"Henghao Zhao, Ge-Peng Ji, Rui Yan, Huan Xiong, and Zechao Li. 2025. VideoExpert: Augmented LLM for Temporal-Sensitive Video Understanding. In arXiv."},{"key":"e_1_3_3_1_53_2","unstructured":"Henghao Zhao Kevin\u00a0Qinghong Lin Rui Yan and Zechao Li. 2024. DiffusionVMR: Diffusion Model for Joint Video Moment Retrieval and Highlight Detection. IEEE Transactions on Neural Networks and Learning Systems (2024) 1\u201314."},{"key":"e_1_3_3_1_54_2","volume-title":"Advances in Neural Information Processing Systems\u00a0(NeurIPS)","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric\u00a0P. Xing, Hao Zhang, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In Advances in Neural Information Processing Systems\u00a0(NeurIPS)."},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"e_1_3_3_1_56_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition\u00a0(CVPR)","author":"Zhou Luowei","year":"2018","unstructured":"Luowei Zhou, Yingbo Zhou, Jason Corso, Richard Socher, and Caiming Xiong. 2018. 
End-to-end dense video captioning with masked transformer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition\u00a0(CVPR)."},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01727"},{"key":"e_1_3_3_1_59_2","volume-title":"Proceedings of the International Conference on Learning Representations\u00a0(ICLR)","author":"Zhu Deyao","year":"2024","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2024. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. In Proceedings of the International Conference on Learning Representations\u00a0(ICLR)."}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3771066","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:09:21Z","timestamp":1765008561000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3771066"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":58,"alternative-id":["10.1145\/3743093.3771066","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3771066","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}