{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T15:54:30Z","timestamp":1776441270523,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["62302474"],"award-info":[{"award-number":["62302474"]}]},{"name":"the National Science Fund for Excellent Young Scholars","award":["62222212"],"award-info":[{"award-number":["62222212"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681603","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"496-505","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":14,"title":["Dual-path Collaborative Generation Network for Emotional Video Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-4792-3576","authenticated-orcid":false,"given":"Cheng","family":"Ye","sequence":"first","affiliation":[{"name":"School of Information Science and Technology, USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2774-2875","authenticated-orcid":false,"given":"Weidong","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9561-7550","authenticated-orcid":false,"given":"Jingyu","family":"Li","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2839-8693","authenticated-orcid":false,"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5739-8126","authenticated-orcid":false,"given":"Zhendong","family":"Mao","sequence":"additional","affiliation":[{"name":"School of Information Science and Technology, USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2023 MM. Emotion-prior awareness network for emotional video captioning. 589--600."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547761"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3514250","article-title":"Weakly supervised text-based actor-action video segmentation by clip-level multi-instance learning","volume":"19","author":"Chen Weidong","year":"2023","unstructured":"Weidong Chen, Guorong Li, Xinfeng Zhang, Shuhui Wang, Liang Li, and Qingming Huang. 2023. Weakly supervised text-based actor-action video segmentation by clip-level multi-instance learning. ACM Transactions on Multimedia Computing, Communications and Applications 19, 1 (2023), 1--22.","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475534"},{"key":"e_1_3_2_1_8_1","volume-title":"From static to dynamic: Adapting landmark-aware image models for facial expression recognition in videos. arXiv preprint arXiv:2312.05447","author":"Chen Yin","year":"2023","unstructured":"Yin Chen, Jia Li, Shiguang Shan, Meng Wang, and Richang Hong. 2023. From static to dynamic: Adapting landmark-aware image models for facial expression recognition in videos. arXiv preprint arXiv:2312.05447 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Disentangled Cascaded Graph Convolution Networks for Multi-Behavior Recommendation. ACM Transactions on Recommender Systems","author":"Cheng Zhiyong","year":"2024","unstructured":"Zhiyong Cheng, Jianhua Dong, Fan Liu, Lei Zhu, Xun Yang, and Meng Wang. 2024. Disentangled Cascaded Graph Convolution Networks for Multi-Behavior Recommendation. ACM Transactions on Recommender Systems (2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02070"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_12_1","first-page":"1","article-title":"Sentimentoriented transformer-based variational autoencoder network for live video commenting","volume":"20","author":"Fu Fengyi","year":"2024","unstructured":"Fengyi Fu, Shancheng Fang,Weidong Chen, and Zhendong Mao. 2024. Sentimentoriented transformer-based variational autoencoder network for live video commenting. ACM Transactions on Multimedia Computing, Communications and Applications 20, 4 (2024), 1--24.","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"e_1_3_2_1_13_1","volume-title":"Benchmarking Micro-action Recognition: Dataset, Method, and Application","author":"Guo Dan","year":"2024","unstructured":"Dan Guo, Kun Li, Bin Hu, Yan Zhang, and Meng Wang. 2024. Benchmarking Micro-action Recognition: Dataset, Method, and Application. IEEE Transactions on Circuits and Systems for Video Technology (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350881"},{"key":"e_1_3_2_1_15_1","unstructured":"Yanbin Hao Diansong Zhou Zhicai Wang Chong-Wah Ngo Xiangnan He and Meng Wang. 2023. PosMLP-Video: Spatial and Temporal Relative Position Encoding for Efficient Video Recognition. (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401063"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-021-11878-w"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01007"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v28i1.8724"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448326"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00620"},{"key":"e_1_3_2_1_24_1","first-page":"1","article-title":"Exploring Visual Relationships via Transformer-based Graphs for Enhanced Image Captioning","volume":"20","author":"Li Jingyu","year":"2024","unstructured":"Jingyu Li, Zhendong Mao, Hao Li, Weidong Chen, and Yongdong Zhang. 2024. Exploring Visual Relationships via Transformer-based Graphs for Enhanced Image Captioning. ACM Transactions on Multimedia Computing, Communications and Applications 20, 5 (2024), 1--23.","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"e_1_3_2_1_25_1","volume-title":"Emotion separation and recognition from a facial expression by generating the poker face with vision transformers. arXiv preprint arXiv:2207.11081","author":"Li Jia","year":"2022","unstructured":"Jia Li, Jiantao Nie, Dan Guo, Richang Hong, and Meng Wang. 2022. Emotion separation and recognition from a facial expression by generating the poker face with vision transformers. arXiv preprint arXiv:2207.11081 (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740-- 755."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29826"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2495248"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00561"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","volume-title":"Contextual attention network for emotional video captioning","author":"Song Peipei","year":"2022","unstructured":"Peipei Song, Dan Guo, Jun Cheng, and Meng Wang. 2022. Contextual attention network for emotional video captioning. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3359045"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2022.3187695"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSS.2023.3256889"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00795"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612245"},{"key":"e_1_3_2_1_41_1","volume-title":"d.]. Emotion expression with fact transfer for video description","author":"Tang Pengjie","unstructured":"HanliWang, Pengjie Tang, Qinyu Li, and Meng Cheng. [n. d.]. Emotion expression with fact transfer for video description. IEEE Transactions on Multimedia ([n. d.])."},{"key":"e_1_3_2_1_42_1","volume-title":"GIT: A Generative Image-to-text Transformer for Vision and Language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. GIT: A Generative Image-to-text Transformer for Vision and Language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00693"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2432791"},{"key":"e_1_3_2_1_45_1","volume-title":"Contour-Augmented Concept Prediction Network for Image Captioning. In International Conference on Artificial Neural Networks. Springer, 180--191","author":"Wang Ting","year":"2023","unstructured":"Ting Wang, Weidong Chen, Jingyu Li, Yixing Peng, and Zhendong Mao. 2023. Contour-Augmented Concept Prediction Network for Image Captioning. In International Conference on Artificial Neural Networks. Springer, 180--191."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.25"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.3048414"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2014.2310117"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301363"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-18907-4_29"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16421"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3118983"},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 4237-- 4246","author":"Yang Jingyuan","year":"2021","unstructured":"Jingyuan Yang, Jie Li, Leida Li, Xiumei Wang, and Xinbo Gao. 2021. A circularstructured representation for visual emotion distribution learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 4237-- 4246."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2012.2190083"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2928998"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5364"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3094362"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16465"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681603","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681603","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681603"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":58,"alternative-id":["10.1145\/3664647.3681603","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681603","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}