{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:40:45Z","timestamp":1755826845157,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":14,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["021714380026"],"award-info":[{"award-number":["021714380026"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Science Foundation of China","award":["62072232"],"award-info":[{"award-number":["62072232"]}]},{"name":"Collaborative Innovation Center of Novel Software Technology and Industrialization"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612863","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"9551-9555","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Deep Video Understanding with Video-Language Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-9993-9576","authenticated-orcid":false,"given":"Runze","family":"Liu","sequence":"first","affiliation":[{"name":"Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9065-968X","authenticated-orcid":false,"given":"Yaqun","family":"Fang","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2789-0325","authenticated-orcid":false,"given":"Fan","family":"Yu","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7840-7159","authenticated-orcid":false,"given":"Ruiqi","family":"Tian","sequence":"additional","affiliation":[{"name":"University of British Columbia, Vancouver, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3092-424X","authenticated-orcid":false,"given":"Tongwei","family":"Ren","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1391-1762","authenticated-orcid":false,"given":"Gangshan","family":"Wu","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3390742"},{"key":"e_1_3_2_2_2_1","volume-title":"MIST: Multi-modal Iterative Spatial-Temporal Transformer for Long-form Video Question Answering. In The IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14773--14783","author":"Gao Difei","year":"2023","unstructured":"Difei Gao, Luowei Zhou, Lei Ji, Linchao Zhu, Yi Yang, and Mike Zheng Shou. 2023. MIST: Multi-modal Iterative Spatial-Temporal Transformer for Long-form Video Question Answering. In The IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14773--14783."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551604"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551610"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01016"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551575"},{"key":"e_1_3_2_2_9_1","volume-title":"Long-form video-language pre-training with multimodal temporal contrastive learning. Advances in neural information processing systems 35","author":"Sun Yuchong","year":"2022","unstructured":"Yuchong Sun, Hongwei Xue, Ruihua Song, Bei Liu, Huan Yang, and Jianlong Fu. 2022. Long-form video-language pre-training with multimodal temporal contrastive learning. Advances in neural information processing systems 35 (2022), 38032--38045."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551578"},{"key":"e_1_3_2_2_11_1","volume-title":"Vlm: Task- agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996","author":"Xu Hu","year":"2021","unstructured":"Hu Xu, Gargi Ghosh, Po-Yao Huang, Prahal Arora, Masoumeh Aminzadeh, Christoph Feichtenhofer, Florian Metze, and Luke Zettlemoyer. 2021. Vlm: Task- agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996 (2021)."},{"key":"e_1_3_2_2_12_1","volume-title":"Self-Chained Image-Language Model for Video Localization and Question Answering. arXiv preprint arXiv:2305.06988","author":"Yu Shoubin","year":"2023","unstructured":"Shoubin Yu, Jaemin Cho, Prateek Yadav, and Mohit Bansal. 2023. Self-Chained Image-Language Model for Video Localization and Question Answering. arXiv preprint arXiv:2305.06988 (2023)."},{"key":"e_1_3_2_2_13_1","volume-title":"Multimodal Analysis for Deep Video Understanding with Video Language Transformer. In The 30th ACM International Conference on Multimedia. 7165--7169","author":"Zhang Beibei","year":"2022","unstructured":"Beibei Zhang, Yaqun Fang, Tongwei Ren, and Gangshan Wu. 2022. Multimodal Analysis for Deep Video Understanding with Video Language Transformer. In The 30th ACM International Conference on Multimedia. 7165--7169."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Beibei Zhang Fan Yu Yaqun Fang Tongwei Ren and Gangshan Wu. 2021. Hybrid improvements in multimodal analysis for deep video understanding. In ACM Multimedia Asia. 1--5.","DOI":"10.1145\/3469877.3493599"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612863","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612863","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:15:12Z","timestamp":1755821712000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612863"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":14,"alternative-id":["10.1145\/3581783.3612863","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612863","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}