{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:09:31Z","timestamp":1765343371987,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. U23A20286, No. 62301121"],"award-info":[{"award-number":["No. U23A20286, No. 62301121"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Postdoctoral Fellowship Program (Grade B) of China Postdoctoral Science Foundation","award":["No.GZB20240120"],"award-info":[{"award-number":["No.GZB20240120"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755167","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"3731-3740","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Unsupervised Ego- and Exo-centric Dense Procedural Activity Captioning via Gaze Consensus Adaptation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6313-8670","authenticated-orcid":false,"given":"Zhaofeng","family":"Shi","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0963-0311","authenticated-orcid":false,"given":"Heqian","family":"Qiu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3745-0262","authenticated-orcid":false,"given":"Lanxiao","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2936-6340","authenticated-orcid":false,"given":"Qingbo","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3016-2567","authenticated-orcid":false,"given":"Fanman","family":"Meng","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7481-095X","authenticated-orcid":false,"given":"Hongliang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Egocentric meets top-view","author":"Ardeshir Shervin","year":"2018","unstructured":"Shervin Ardeshir and Ali Borji. 2018a. Egocentric meets top-view. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 6 (2018), 1353-1366."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2018.05.005"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681214"},{"key":"e_1_3_2_1_4_1","volume-title":"Contrastive Adversarial Training for Unsupervised Domain Adaptation. arXiv preprint arXiv:2407.12782","author":"Chen Jiahong","year":"2024","unstructured":"Jiahong Chen, Zhilin Zhang, Lucy Li, Behzad Shahrasbi, and Arjun Mishra. 2024b. Contrastive Adversarial Training for Unsupervised Domain Adaptation. arXiv preprint arXiv:2407.12782 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00642"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the European conference on computer vision (ECCV). 720-736","author":"Damen Dima","year":"2018","unstructured":"Dima Damen, Hazel Doughty, Giovanni Maria Farinella, Sanja Fidler, Antonino Furnari, Evangelos Kazakos, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et al., 2018. Scaling egocentric vision: The epic-kitchens dataset. In Proceedings of the European conference on computer vision (ECCV). 720-736."},{"key":"e_1_3_2_1_7_1","unstructured":"Fernando De la Torre Jessica Hodgins Adam Bargteil Xavier Martin Justin Macey Alex Collado and Pep Beltran. 2009. Guide to the carnegie mellon university multimodal activity (cmu-mmac) database. (2009)."},{"key":"e_1_3_2_1_8_1","volume-title":"Unlocking exocentric video-language data for egocentric video representation learning. arXiv preprint arXiv:2408.03567","author":"Dou Zi-Yi","year":"2024","unstructured":"Zi-Yi Dou, Xitong Yang, Tushar Nagarajan, Huiyu Wang, Jing Huang, Nanyun Peng, Kris Kitani, and Fu-Jen Chu. 2024. Unlocking exocentric video-language data for egocentric video representation learning. arXiv preprint arXiv:2408.03567 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547955"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2022.3141105"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00190"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.503"},{"key":"e_1_3_2_1_13_1","volume-title":"Soda: Story oriented dense video captioning evaluation framework. In Computer Vision-ECCV 2020: 16th European Conference","author":"Fujita Soichiro","year":"2020","unstructured":"Soichiro Fujita, Tsutomu Hirao, Hidetaka Kamigaito, Manabu Okumura, and Masaaki Nagata. 2020. Soda: Story oriented dense video captioning evaluation framework. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part VI 16. Springer, 517-531."},{"key":"e_1_3_2_1_14_1","volume-title":"International conference on machine learning. PMLR, 1180-1189","author":"Ganin Yaroslav","year":"2015","unstructured":"Yaroslav Ganin and Victor Lempitsky. 2015. Unsupervised domain adaptation by backpropagation. In International conference on machine learning. PMLR, 1180-1189."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01834"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3007841"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_46"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02084"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3078882"},{"key":"e_1_3_2_1_21_1","volume-title":"empathy, and mirror neurons. Annual review of psychology","author":"Iacoboni Marco","year":"2009","unstructured":"Marco Iacoboni. 2009. Imitation, empathy, and mirror neurons. Annual review of psychology, Vol. 60, 1 (2009), 653-670."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops. 958-959","author":"Iashin Vladimir","year":"2020","unstructured":"Vladimir Iashin and Esa Rahtu. 2020. Multi-modal dense video captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops. 958-959."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.396"},{"key":"e_1_3_2_1_24_1","volume-title":"European Conference on Computer Vision. Springer, 767-786","author":"Jia Baoxiong","year":"2020","unstructured":"Baoxiong Jia, Yixin Chen, Siyuan Huang, Yixin Zhu, and Song-Chun Zhu. 2020. Lemma: A multi-view dataset for le arning m ulti-agent m ulti-task a ctivities. In European Conference on Computer Vision. Springer, 767-786."},{"key":"e_1_3_2_1_25_1","volume-title":"Bodhisattwa Prasad Majumder, and Manmohan Chandraker","author":"Kalluri Tarun","year":"2024","unstructured":"Tarun Kalluri, Bodhisattwa Prasad Majumder, and Manmohan Chandraker. 2024. Tell, Don't Show!: Language Guidance Eases Transfer Across Domains in Images and Videos. arXiv preprint arXiv:2403.05535 (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s12369-021-00842-1"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01318"},{"key":"e_1_3_2_1_28_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00998"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01879-7"},{"key":"e_1_3_2_1_32_1","volume-title":"European Conference on Computer Vision. Springer, 192-210","author":"Lai Bolin","year":"2024","unstructured":"Bolin Lai, Fiona Ryan, Wenqi Jia, Miao Liu, and James M Rehg. 2024b. Listen to look into the future: Audio-visual egocentric gaze anticipation. In European Conference on Computer Vision. Springer, 192-210."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.399"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462640"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Yin Li Miao Liu and James M Rehg. 2021a. In the eye of the beholder: Gaze and actions in first person video. IEEE transactions on pattern analysis and machine intelligence Vol. 45 6 (2021) 6731-6747.","DOI":"10.1109\/TPAMI.2021.3051319"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00687"},{"key":"e_1_3_2_1_37_1","volume-title":"European Conference on Computer Vision. Springer, 363-382","author":"Li Yuan-Ming","year":"2024","unstructured":"Yuan-Ming Li, Wei-Jin Huang, An-Lan Wang, Ling-An Zeng, Jing-Ke Meng, and Wei-Shi Zheng. 2024. Egoexo-fitness: Towards egocentric and exocentric full-body action understanding. In European Conference on Computer Vision. Springer, 363-382."},{"key":"e_1_3_2_1_38_1","first-page":"7575","article-title":"Egocentric video-language pretraining","volume":"35","author":"Lin Kevin Qinghong","year":"2022","unstructured":"Kevin Qinghong Lin, Jinpeng Wang, Mattia Soldan, Michael Wray, Rui Yan, Eric Z Xu, Difei Gao, Rong-Cheng Tu, Wenzhe Zhao, Weijie Kong, et al., 2022. Egocentric video-language pretraining. Advances in Neural Information Processing Systems, Vol. 35 (2022), 7575-7586.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053957"},{"key":"e_1_3_2_1_40_1","first-page":"136149","article-title":"Exocentric-to-egocentric video generation","volume":"37","author":"Liu Jia-Wei","year":"2024","unstructured":"Jia-Wei Liu, Weijia Mao, Zhongcong Xu, Jussi Keppo, and Mike Zheng Shou. 2024. Exocentric-to-egocentric video generation. Advances in Neural Information Processing Systems, Vol. 37 (2024), 136149-136172.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","volume-title":"Intention-driven ego-to-exo video generation. arXiv preprint arXiv:2403.09194","author":"Luo Hongchen","year":"2024","unstructured":"Hongchen Luo, Kai Zhu, Wei Zhai, and Yang Cao. 2024. Intention-driven ego-to-exo video generation. arXiv preprint arXiv:2403.09194 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00675"},{"key":"e_1_3_2_1_43_1","first-page":"29794","article-title":"Shaping embodied agent behavior with activity-context priors from egocentric video","volume":"34","author":"Nagarajan Tushar","year":"2021","unstructured":"Tushar Nagarajan and Kristen Grauman. 2021. Shaping embodied agent behavior with activity-context priors from egocentric video. Advances in Neural Information Processing Systems, Vol. 34 (2021), 29794-29805.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475557"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00807"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"e_1_3_2_1_47_1","volume-title":"EgoMe: Follow Me via Egocentric View in Real World. arXiv preprint arXiv:2501.19061","author":"Qiu Heqian","year":"2025","unstructured":"Heqian Qiu, Zhaofeng Shi, Lanxiao Wang, Huiyu Xiong, Xiang Li, and Hongliang Li. 2025. EgoMe: Follow Me via Egocentric View in Real World. arXiv preprint arXiv:2501.19061 (2025)."},{"key":"e_1_3_2_1_48_1","volume-title":"European Conference on Computer Vision. Springer, 253-270","author":"Quattrocchi Camillo","year":"2024","unstructured":"Camillo Quattrocchi, Antonino Furnari, Daniele Di Mauro, Mario Valerio Giuffrida, and Giovanni Maria Farinella. 2024. Synchronization is all you need: Exocentric-to-egocentric transfer for temporal action segmentation with unlabeled synchronized video pairs. In European Conference on Computer Vision. Springer, 253-270."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00481"},{"key":"e_1_3_2_1_50_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01103"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.neuro.27.070203.144230"},{"key":"e_1_3_2_1_53_1","volume-title":"Iberian Robotics conference. Springer, 155-166","author":"Rocha Bernardo","year":"2023","unstructured":"Bernardo Rocha, Plinio Moreno, and Alexandre Bernardino. 2023. Cross-view generalisation in action recognition: Feature design for transitioning from exocentric to egocentric views. In Iberian Robotics conference. Springer, 155-166."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"e_1_3_2_1_55_1","volume-title":"Cognition Transferring and Decoupling for Text-supervised Egocentric Semantic Segmentation","author":"Shi Zhaofeng","year":"2024","unstructured":"Zhaofeng Shi, Heqian Qiu, Lanxiao Wang, Fanman Meng, Qingbo Wu, and Hongliang Li. 2024. Cognition Transferring and Decoupling for Text-supervised Egocentric Semantic Segmentation. IEEE Transactions on Circuits and Systems for Video Technology (2024)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00772"},{"key":"e_1_3_2_1_57_1","volume-title":"Charades-ego: A large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626","author":"Sigurdsson Gunnar A","year":"2018","unstructured":"Gunnar A Sigurdsson, Abhinav Gupta, Cordelia Schmid, Ali Farhadi, and Karteek Alahari. 2018b. Charades-ego: A large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626 (2018)."},{"key":"e_1_3_2_1_58_1","volume-title":"Asian Conference on Computer Vision. Springer, 178-193","author":"Soran Bilge","year":"2014","unstructured":"Bilge Soran, Ali Farhadi, and Linda Shapiro. 2014. Action recognition in the presence of one egocentric and multiple static cameras. In Asian Conference on Computer Vision. Springer, 178-193."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3183112"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00035"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479954"},{"key":"e_1_3_2_1_62_1","volume-title":"Exocentric To Egocentric Transfer For Action Recognition: A Short Survey. arXiv preprint arXiv:2410.20621","author":"Thatipelli Anirudh","year":"2024","unstructured":"Anirudh Thatipelli, Shao-Yuan Lo, and Amit K Roy-Chowdhury. 2024. Exocentric To Egocentric Transfer For Action Recognition: A Short Survey. arXiv preprint arXiv:2410.20621 (2024)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128731"},{"key":"e_1_3_2_1_64_1","volume-title":"Deep domain confusion: Maximizing for domain invariance. arXiv preprint arXiv:1412.3474","author":"Tzeng Eric","year":"2014","unstructured":"Eric Tzeng, Judy Hoffman, Ning Zhang, Kate Saenko, and Trevor Darrell. 2014. Deep domain confusion: Maximizing for domain invariance. arXiv preprint arXiv:1412.3474 (2014)."},{"key":"e_1_3_2_1_65_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00751"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3014606"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00806"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01769"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01284"},{"key":"e_1_3_2_1_72_1","first-page":"53688","article-title":"Learning fine-grained view-invariant representations from unpaired ego-exo videos via temporal alignment","volume":"36","author":"Xue Zihui Sherry","year":"2023","unstructured":"Zihui Sherry Xue and Kristen Grauman. 2023. Learning fine-grained view-invariant representations from unpaired ego-exo videos via temporal alignment. Advances in Neural Information Processing Systems, Vol. 36 (2023), 53688-53710.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2018.8451740"},{"key":"e_1_3_2_1_75_1","volume-title":"Joo Hwee Lim, Qi Zhao, and Jiashi Feng.","author":"Zhang Mengmi","year":"2018","unstructured":"Mengmi Zhang, Keng Teck Ma, Joo Hwee Lim, Qi Zhao, and Jiashi Feng. 2018. Anticipating where people will look using adversarial networks. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 8 (2018), 1783-1796."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.377"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3039522"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3517031.3529628"},{"key":"e_1_3_2_1_79_1","volume-title":"Self-explainable affordance learning with embodied caption. arXiv preprint arXiv:2404.05603","author":"Zhang Zhipeng","year":"2024","unstructured":"Zhipeng Zhang, Zhimin Wei, Guolei Sun, Peng Wang, and Luc Van Gool. 2024. Self-explainable affordance learning with embodied caption. arXiv preprint arXiv:2404.05603 (2024)."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01727"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755167","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:05:09Z","timestamp":1765343109000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755167"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":81,"alternative-id":["10.1145\/3746027.3755167","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755167","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}