{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:50:14Z","timestamp":1777657814593,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science and Technology Major Project of China","award":["No.2021ZD0113000"],"award-info":[{"award-number":["No.2021ZD0113000"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681214","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"137-146","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Egocentric Vehicle Dense Video Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-9098-4447","authenticated-orcid":false,"given":"Feiyu","family":"Chen","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, Beijing Normal University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9288-1743","authenticated-orcid":false,"given":"Cong","family":"Xu","sequence":"additional","affiliation":[{"name":"IEIT SYSTEMS Co., Ltd., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0481-4311","authenticated-orcid":false,"given":"Qi","family":"Jia","sequence":"additional","affiliation":[{"name":"IEIT SYSTEMS Co., Ltd., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7470-960X","authenticated-orcid":false,"given":"Yihua","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Beijing Normal University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5391-0217","authenticated-orcid":false,"given":"Yuhan","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Beijing Normal University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5438-5960","authenticated-orcid":false,"given":"Haotian","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Beijing Normal University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8104-6074","authenticated-orcid":false,"given":"Endong","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Beijing Normal University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"A Survey on 3D Egocentric Human Pose Estimation. arXiv preprint arXiv:2403.17893","author":"Azam Md Mushfiqur","year":"2024","unstructured":"Md Mushfiqur Azam and Kevin Desai. 2024. A Survey on 3D Egocentric Human Pose Estimation. arXiv preprint arXiv:2403.17893 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.675"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_6_1","volume-title":"iPerceive: Applying common-sense reasoning to multi-modal dense video captioning and video question answering. arXiv preprint arXiv:2011.07735","author":"Chadha Aman","year":"2020","unstructured":"Aman Chadha, Gurneet Arora, and Navpreet Kaloty. 2020. iPerceive: Applying common-sense reasoning to multi-modal dense video captioning and video question answering. arXiv preprint arXiv:2011.07735 (2020)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00832"},{"key":"e_1_3_2_1_8_1","volume-title":"Can Vision-Language Models Think from a First-Person Perspective? arXiv preprint arXiv:2311.15596","author":"Cheng Sijie","year":"2023","unstructured":"Sijie Cheng, Zhicheng Guo, Jingwen Wu, Kechen Fang, Peng Li, Huaping Liu, and Yang Liu. 2023. Can Vision-Language Models Think from a First-Person Perspective? arXiv preprint arXiv:2311.15596 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the European conference on computer vision (ECCV). 720--736","author":"Damen Dima","year":"2018","unstructured":"Dima Damen, Hazel Doughty, Giovanni Maria Farinella, Sanja Fidler, Antonino Furnari, Evangelos Kazakos, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et al. 2018. Scaling egocentric vision: The epic-kitchens dataset. In Proceedings of the European conference on computer vision (ECCV). 720--736."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00030"},{"key":"e_1_3_2_1_11_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_12_1","volume-title":"HiLMD: Towards High-Resolution Understanding in Multimodal Large Language Models for Autonomous Driving. arXiv preprint arXiv:2309.05186","author":"Ding Xinpeng","year":"2023","unstructured":"Xinpeng Ding, Jianhua Han, Hang Xu,Wei Zhang, and Xiaomeng Li. 2023. HiLMD: Towards High-Resolution Understanding in Multimodal Large Language Models for Autonomous Driving. arXiv preprint arXiv:2309.05186 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al.","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi SM Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al. 2023. Palm-e: An embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings, Part VI 16","author":"Fujita Soichiro","year":"2020","unstructured":"Soichiro Fujita, Tsutomu Hirao, Hidetaka Kamigaito, Manabu Okumura, and Masaaki Nagata. 2020. SODA: Story oriented dense video captioning evaluation framework. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part VI 16. Springer, 517--531."},{"key":"e_1_3_2_1_15_1","volume-title":"Cees Snoek, Fabian Caba Heilbron, Humam Alwassel, Victor Escorcia, Ranjay Krishna, Shyamal Buch, and Cuong Duc Dao.","author":"Ghanem Bernard","year":"2018","unstructured":"Bernard Ghanem, Juan Carlos Niebles, Cees Snoek, Fabian Caba Heilbron, Humam Alwassel, Victor Escorcia, Ranjay Krishna, Shyamal Buch, and Cuong Duc Dao. 2018. The activitynet large-scale activity recognition challenge 2018 summary. arXiv preprint arXiv:1808.03766 (2018)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00393"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583452"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02194"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Yifei Huang Guo Chen Jilan Xu Mingfang Zhang Lijin Yang Baoqi Pei Hongjie Zhang Lu Dong Yali Wang Limin Wang et al. 2024. EgoExoLearn: A Dataset for Bridging Asynchronous Ego-and Exo-centric View of Procedural Activities in Real World. arXiv preprint arXiv:2403.16182 (2024).","DOI":"10.1109\/CVPR52733.2024.02084"},{"key":"e_1_3_2_1_23_1","volume-title":"A better use of audio-visual cues: Dense video captioning with bi-modal transformer. arXiv preprint arXiv:2005.08271","author":"Iashin Vladimir","year":"2020","unstructured":"Vladimir Iashin and Esa Rahtu. 2020. A better use of audio-visual cues: Dense video captioning with bi-modal transformer. arXiv preprint arXiv:2005.08271 (2020)."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops. 958--959","author":"Iashin Vladimir","year":"2020","unstructured":"Vladimir Iashin and Esa Rahtu. 2020. Multi-modal dense video captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops. 958--959."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.648"},{"key":"e_1_3_2_1_26_1","first-page":"3343","article-title":"Egotaskqa: Understanding human tasks in egocentric videos","volume":"35","author":"Jia Baoxiong","year":"2022","unstructured":"Baoxiong Jia, Ting Lei, Song-Chun Zhu, and Siyuan Huang. 2022. Egotaskqa: Understanding human tasks in egocentric videos. Advances in Neural Information Processing Systems 35 (2022), 3343--3360.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160326"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.320"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_35"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00782"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6815"},{"key":"e_1_3_2_1_33_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01007"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02034"},{"key":"e_1_3_2_1_36_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00011"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00110"},{"key":"e_1_3_2_1_39_1","volume-title":"Gpt-driver: Learning to drive with gpt. arXiv preprint arXiv:2310.01415","author":"Mao Jiageng","year":"2023","unstructured":"Jiageng Mao, Yuxi Qian, Hang Zhao, and Yue Wang. 2023. Gpt-driver: Learning to drive with gpt. arXiv preprint arXiv:2310.01415 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"A language agent for autonomous driving. arXiv preprint arXiv:2311.10813","author":"Mao Jiageng","year":"2023","unstructured":"Jiageng Mao, Junjie Ye, Yuxi Qian, Marco Pavone, and YueWang. 2023. A language agent for autonomous driving. arXiv preprint arXiv:2311.10813 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00675"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01007"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01249"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_45_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00900"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00734"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-control-060117-105157"},{"key":"e_1_3_2_1_49_1","volume-title":"Lmdrive: Closed-loop end-to-end driving with large language models. arXiv preprint arXiv:2312.07488","author":"Shao Hao","year":"2023","unstructured":"Hao Shao, Yuxuan Hu, LetianWang, Steven LWaslander, Yu Liu, and Hongsheng Li. 2023. Lmdrive: Closed-loop end-to-end driving with large language models. arXiv preprint arXiv:2312.07488 (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1641"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00751"},{"key":"e_1_3_2_1_53_1","volume-title":"Temporal segment networks for action recognition in videos","author":"Wang Limin","year":"2018","unstructured":"Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. 2018. Temporal segment networks for action recognition in videos. IEEE transactions on pattern analysis and machine intelligence 41, 11 (2018), 2740--2755."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Tai Wang Xiaohan Mao Chenming Zhu Runsen Xu Ruiyuan Lyu Peisen Li Xiao Chen Wenwei Zhang Kai Chen Tianfan Xue et al. 2023. EmbodiedScan: A Holistic Multi-Modal 3D Perception Suite Towards Embodied AI. arXiv preprint arXiv:2312.16170 (2023).","DOI":"10.1109\/CVPR52733.2024.01868"},{"key":"e_1_3_2_1_55_1","volume-title":"Learning grounded vision-language representation for versatile understanding in untrimmed videos. arXiv preprint arXiv:2303.06378","author":"Wang Teng","year":"2023","unstructured":"Teng Wang, Jinrui Zhang, Feng Zheng, Wenhao Jiang, Ran Cheng, and Ping Luo. 2023. Learning grounded vision-language representation for versatile understanding in untrimmed videos. arXiv preprint arXiv:2303.06378 (2023)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3014606"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1517"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01854"},{"key":"e_1_3_2_1_60_1","volume-title":"Driving into the future: Multiview visual forecasting and planning with world model for autonomous driving. arXiv preprint arXiv:2311.17918","author":"Wang Yuqi","year":"2023","unstructured":"Yuqi Wang, Jiawei He, Lue Fan, Hongxin Li, Yuntao Chen, and Zhaoxiang Zhang. 2023. Driving into the future: Multiview visual forecasting and planning with world model for autonomous driving. arXiv preprint arXiv:2311.17918 (2023)."},{"key":"e_1_3_2_1_61_1","volume-title":"Policy pre-training for autonomous driving via self-supervised geometric modeling. arXiv preprint arXiv:2301.01006","author":"Wu Penghao","year":"2023","unstructured":"Penghao Wu, Li Chen, Hongyang Li, Xiaosong Jia, Junchi Yan, and Yu Qiao. 2023. Policy pre-training for autonomous driving via self-supervised geometric modeling. arXiv preprint arXiv:2301.01006 (2023)."},{"key":"e_1_3_2_1_62_1","volume-title":"Retrieval-augmented egocentric video captioning. arXiv preprint arXiv:2401.00789","author":"Xu Jilan","year":"2024","unstructured":"Jilan Xu, Yifei Huang, Junlin Hou, Guo Chen, Yuejie Zhang, Rui Feng, and Weidi Xie. 2024. Retrieval-augmented egocentric video captioning. arXiv preprint arXiv:2401.00789 (2024)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00486"},{"key":"e_1_3_2_1_64_1","volume-title":"Zhenguo Li, and Hengshuang Zhao.","author":"Xu Zhenhua","year":"2023","unstructured":"Zhenhua Xu, Yujia Zhang, Enze Xie, Zhen Zhao, Yong Guo, Kenneth KY Wong, Zhenguo Li, and Hengshuang Zhao. 2023. Drivegpt4: Interpretable end-to-end autonomous driving via large language model. arXiv preprint arXiv:2310.01412 (2023)."},{"key":"e_1_3_2_1_65_1","volume-title":"HOI-Swap: Swapping Objects in Videos with Hand-Object Interaction Awareness. arXiv preprint arXiv:2406.07754","author":"Xue Zihui","year":"2024","unstructured":"Zihui Xue, Mi Luo, Changan Chen, and Kristen Grauman. 2024. HOI-Swap: Swapping Objects in Videos with Hand-Object Interaction Awareness. arXiv preprint arXiv:2406.07754 (2024)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"e_1_3_2_1_68_1","volume-title":"RAG-Driver: Generalisable Driving Explanations with Retrieval-Augmented In-Context Learning in Multi-Modal Large Language Model. arXiv preprint arXiv:2402.10828","author":"Yuan Jianhao","year":"2024","unstructured":"Jianhao Yuan, Shuyang Sun, Daniel Omeiza, Bo Zhao, Paul Newman, Lars Kunze, and Matthew Gadd. 2024. RAG-Driver: Generalisable Driving Explanations with Retrieval-Augmented In-Context Learning in Multi-Modal Large Language Model. arXiv preprint arXiv:2402.10828 (2024)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_21"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00077"},{"key":"e_1_3_2_1_71_1","volume-title":"Towards Learning a Generalist Model for Embodied Navigation. arXiv preprint arXiv:2312.02010","author":"Zheng Duo","year":"2023","unstructured":"Duo Zheng, Shijia Huang, Lin Zhao, Yiwu Zhong, and Liwei Wang. 2023. Towards Learning a Generalist Model for Embodied Navigation. arXiv preprint arXiv:2312.02010 (2023)."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"e_1_3_2_1_74_1","volume-title":"Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681214","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681214","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681214"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":74,"alternative-id":["10.1145\/3664647.3681214","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681214","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}