{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T03:48:45Z","timestamp":1776138525732,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681189","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"2292-2300","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["<i>Aspects are Anchors:<\/i>\n            Towards Multimodal Aspect-based Sentiment Analysis via Aspect-driven Alignment and Refinement"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-5840-2114","authenticated-orcid":false,"given":"Zhanpeng","family":"Chen","sequence":"first","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4530-5516","authenticated-orcid":false,"given":"Zhihong","family":"Zhu","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3371-0137","authenticated-orcid":false,"given":"Wanshi","family":"Xu","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1306-5746","authenticated-orcid":false,"given":"Yunyan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Jarvis Research Center, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1118-9710","authenticated-orcid":false,"given":"Xian","family":"Wu","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Jarvis Research Center, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2195-2847","authenticated-orcid":false,"given":"Yefeng","family":"Zheng","sequence":"additional","affiliation":[{"name":"Westlake University, Medical Artificial Intelligence Lab &amp; Tencent YouTu Lab, Jarvis Research Center, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"David Alvarez-Melis Tommi S. Jaakkola and Stefanie Jegelka. 2017. Structured Optimal Transport. arxiv: 1712.06199 [stat.ML]"},{"key":"e_1_3_2_1_2_1","volume-title":"CWI: A multimodal deep learning approach for named entity recognition from social media using character, word and image features. Neural Computing and Applications","author":"Asgari-Chenaghlu Meysam","year":"2021","unstructured":"Meysam Asgari-Chenaghlu, M. Reza Feizi-Derakhshi, Leili Farzinvash, M. A. Balafar, and Cina Motamed. 2021. CWI: A multimodal deep learning approach for named entity recognition from social media using character, word and image features. Neural Computing and Applications (2021)."},{"key":"e_1_3_2_1_3_1","volume-title":"Otkge: Multi-modal knowledge graph embeddings via optimal transport. NeurIPS","author":"Cao Zongsheng","year":"2022","unstructured":"Zongsheng Cao, Qianqian Xu, Zhiyong Yang, Yuan He, Xiaochun Cao, and Qingming Huang. 2022. Otkge: Multi-modal knowledge graph embeddings via optimal transport. NeurIPS (2022)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Guimin Chen Yuanhe Tian and Yan Song. 2020. Joint aspect extraction and sentiment analysis with directional graph convolutional networks. In COLING.","DOI":"10.18653\/v1\/2020.coling-main.24"},{"key":"e_1_3_2_1_5_1","unstructured":"Tao Chen Damian Borth Trevor Darrell and Shih-Fu Chang. 2014. DeepSentiBank: Visual Sentiment Concept Classification with Deep Convolutional Neural Networks."},{"key":"e_1_3_2_1_6_1","volume-title":"Sinkhorn distances: Lightspeed computation of optimal transport. NeurIPS","author":"Cuturi Marco","year":"2013","unstructured":"Marco Cuturi. 2013. Sinkhorn distances: Lightspeed computation of optimal transport. NeurIPS (2013)."},{"key":"e_1_3_2_1_7_1","series-title":"SIAM Journal on Imaging Sciences","volume-title":"Regularized Discrete Optimal Transport","author":"Ferradans Sira","year":"2014","unstructured":"Sira Ferradans, Nicolas Papadakis, Gabriel Peyr\u00e9, and Jean-Fran\u00e7ois Aujol. 2014. Regularized Discrete Optimal Transport. SIAM Journal on Imaging Sciences (2014)."},{"key":"e_1_3_2_1_8_1","unstructured":"Xavier Glorot and Yoshua Bengio. 2010. Understanding the difficulty of training deep feedforward neural networks. In AISTATS."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_2_1_10_1","volume-title":"Open-domain targeted sentiment analysis via span-based extraction and classification. arXiv preprint arXiv:1906.03820","author":"Hu Minghao","year":"2019","unstructured":"Minghao Hu, Yuxing Peng, Zhen Huang, Dongsheng Li, and Yiwei Lv. 2019. Open-domain targeted sentiment analysis via span-based extraction and classification. arXiv preprint arXiv:1906.03820 (2019)."},{"key":"e_1_3_2_1_11_1","volume-title":"Learning with noisy correspondence for cross-modal matching. NeurIPS","author":"Huang Zhenyu","year":"2021","unstructured":"Zhenyu Huang, Guocheng Niu, Xiao Liu, Wenbiao Ding, Xinyan Xiao, Hua Wu, and Xi Peng. 2021. Learning with noisy correspondence for cross-modal matching. NeurIPS (2021)."},{"key":"e_1_3_2_1_12_1","unstructured":"Xincheng Ju Dong Zhang Rong Xiao Junhui Li Shoushan Li Min Zhang and Guodong Zhou. 2021. Joint multi-modal aspect-sentiment analysis with auxiliary cross-modal relation detection. In EMNLP."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Zaid Khan and Yun Fu. 2021. Exploiting BERT for multimodal target sentiment classification through input space translation. In ACM MM.","DOI":"10.1145\/3474085.3475692"},{"key":"e_1_3_2_1_14_1","volume-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Contextual Distillation Model for Diversified Recommendation. arXiv","author":"Li Fan","year":"2024","unstructured":"Fan Li, Xu Si, Shisong Tang, Dingmin Wang, Kunyan Han, Bing Han, Guorui Zhou, Yang Song, and Hechang Chen. 2024. Contextual Distillation Model for Diversified Recommendation. arXiv (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"A Cognitive Brain Model for Multimodal Sentiment Analysis Based on Attention Neural Networks. Neurocomputing","author":"Li Yuanqing","year":"2021","unstructured":"Yuanqing Li, Ke Zhang, Jingyu Wang, and Xinbo Gao. 2021. A Cognitive Brain Model for Multimodal Sentiment Analysis Based on Attention Neural Networks. Neurocomputing (2021)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Yan Ling Jianfei Yu and Rui Xia. 2022. Vision-Language Pre-Training for Multimodal Aspect-Based Sentiment Analysis. In ACL.","DOI":"10.18653\/v1\/2022.acl-long.152"},{"key":"e_1_3_2_1_18_1","volume-title":"Aligning visual regions and textual concepts for semantic-grounded image representations. NeurIPS","author":"Liu Fenglin","year":"2019","unstructured":"Fenglin Liu, Yuanxin Liu, Xuancheng Ren, Xiaodong He, and Xu Sun. 2019. Aligning visual regions and textual concepts for semantic-grounded image representations. NeurIPS (2019)."},{"key":"e_1_3_2_1_19_1","volume-title":"Learning to Align Sequential Actions in the Wild. CoRR","author":"Liu Weizhe","year":"2021","unstructured":"Weizhe Liu, Bugra Tekin, Huseyin Coskun, Vibhav Vineet, Pascal Fua, and Marc Pollefeys. 2021. Learning to Align Sequential Actions in the Wild. CoRR (2021)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Sijie Mai Haifeng Hu and Songlong Xing. 2019. Divide conquer and combine: Hierarchical feature fusion network with local and global perspectives for multimodal affective computing. In ACL.","DOI":"10.18653\/v1\/P19-1046"},{"key":"e_1_3_2_1_21_1","volume-title":"Mapping Estimation for Discrete Optimal Transport","author":"Perrot Micha\u00ebl","unstructured":"Micha\u00ebl Perrot, Nicolas Courty, R\u00e9mi Flamary, and Amaury Habrard. 2016. Mapping Estimation for Discrete Optimal Transport. Le Centre pour la Communication Scientifique Directe - HAL - Universit\u00e9 de Nantes,Le Centre pour la Communication Scientifique Directe - HAL - Universit\u00e9 de Nantes (2016)."},{"key":"e_1_3_2_1_22_1","volume-title":"Superglue: Learning feature matching with graph neural networks. In CVPR.","author":"Sarlin Paul-Edouard","year":"2020","unstructured":"Paul-Edouard Sarlin, Daniel DeTone, Tomasz Malisiewicz, and Andrew Rabinovich. 2020. Superglue: Learning feature matching with graph neural networks. In CVPR."},{"key":"e_1_3_2_1_23_1","unstructured":"Bing Su and Gang Hua. 2017. Order-Preserving Wasserstein Distance for Sequence Matching. In CVPR."},{"key":"e_1_3_2_1_24_1","volume-title":"RpBERT: A Text-image Relation Propagation-based BERT Model for Multimodal NER. AAAI","author":"Sun Lin","year":"2021","unstructured":"Lin Sun, Jiquan Wang, Kai Zhang, Yindu Su, and Fangsheng Weng. 2021. RpBERT: A Text-image Relation Propagation-based BERT Model for Multimodal NER. AAAI (2021)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Shisong Tang Qing Li Xiaoteng Ma Ci Gao Dingmin Wang Yong Jiang Qian Ma Aoyang Zhang and Hechang Chen. 2022. Knowledge-based Temporal Fusion Network for Interpretable Online Video Popularity Prediction. In WWW.","DOI":"10.1145\/3485447.3511934"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Shisong Tang Qing Li Dingmin Wang Ci Gao Wentao Xiao Dan Zhao Yong Jiang Qian Ma and Aoyang Zhang. 2023. Counterfactual Video Recommendation for Duration Debiasing. In KDD.","DOI":"10.1145\/3580305.3599797"},{"key":"e_1_3_2_1_27_1","volume-title":"Vistanet: Visual aspect attention network for multimodal sentiment analysis. In AAAI.","author":"Truong Quoc-Tuan","year":"2019","unstructured":"Quoc-Tuan Truong and Hady W Lauw. 2019. Vistanet: Visual aspect attention network for multimodal sentiment analysis. In AAAI."},{"key":"e_1_3_2_1_28_1","unstructured":"Hanqian Wu Siliang Cheng Jingjing Wang Shoushan Li and Lian Chi. 2020. Multimodal aspect extraction with region-aware alignment network. In NLPCC."},{"key":"e_1_3_2_1_29_1","unstructured":"Yang Wu Yanyan Zhao Hao Yang Song Chen Bing Qin Xiaohuan Cao and Wenting Zhao. 2022. Sentiment Word Aware Multimodal Refinement for Multimodal Sentiment Analysis with ASR Errors. In ACL Findings."},{"key":"e_1_3_2_1_30_1","unstructured":"Zhiwei Wu Changmeng Zheng Yi Cai Junying Chen Ho-fung Leung and Qing Li. 2020. Multimodal representation with embedded visual guiding objects for named entity recognition in social media posts. In ACM MM."},{"key":"e_1_3_2_1_31_1","unstructured":"Renjun Xu Pelen Liu Liyan Wang Chao Chen and Jindong Wang. 2020. Reliable Weighted Optimal Transport for Unsupervised Domain Adaptation. In CVPR."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Hang Yan Junqi Dai Xipeng Qiu Zheng Zhang et al. 2021. A unified generative framework for aspect-based sentiment analysis. arXiv preprint arXiv:2106.04300 (2021).","DOI":"10.18653\/v1\/2021.acl-long.188"},{"key":"e_1_3_2_1_33_1","volume-title":"Cross-Modal Multitask Transformer for End-to-End Multimodal Aspect-Based Sentiment Analysis. Information Processing & Management","author":"Yang Li","year":"2022","unstructured":"Li Yang, Jin-Cheon Na, and Jianfei Yu. 2022. Cross-Modal Multitask Transformer for End-to-End Multimodal Aspect-Based Sentiment Analysis. Information Processing & Management (2022)."},{"key":"e_1_3_2_1_34_1","unstructured":"Jianfei Yu and Jing Jiang. 2019. Adapting BERT for Target-Oriented Multimodal Sentiment Classification. In IJCAI."},{"key":"e_1_3_2_1_35_1","volume-title":"Entity-sensitive attention and fusion network for entity-level multimodal sentiment classification. TASLP","author":"Yu Jianfei","year":"2019","unstructured":"Jianfei Yu, Jing Jiang, and Rui Xia. 2019. Entity-sensitive attention and fusion network for entity-level multimodal sentiment classification. TASLP (2019)."},{"key":"e_1_3_2_1_36_1","unstructured":"Jianfei Yu Jing Jiang Li Yang and Rui Xia. 2020. Improving Multimodal Named Entity Recognition via Entity Span Detection with Unified Multimodal Transformer. In ACL."},{"key":"e_1_3_2_1_37_1","unstructured":"Weijie Yu Liang Pang Jun Xu Bing Su Zhenhua Dong and Ji-Rong Wen. 2022. Optimal Partial Transport Based Sentence Selection for Long-form Document Matching. In COLING."},{"key":"e_1_3_2_1_38_1","unstructured":"Zhewen Yu Jin Wang Liang-Chih Yu and Xuejie Zhang. 2022. Dual-Encoder Transformers with Cross-modal Alignment for Multimodal Aspect-based Sentiment Analysis. In AACL."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Qi Zhang Jinlan Fu Xiaoyu Liu and Xuanjing Huang. 2018. Adaptive co-attention network for named entity recognition in tweets. In AAAI.","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"e_1_3_2_1_40_1","volume-title":"Learning Disentangled Representation for Multimodal Cross-Domain Sentiment Analysis. TNNLS","author":"Zhang Yuhao","year":"2022","unstructured":"Yuhao Zhang, Ying Zhang, Wenya Guo, Xiangrui Cai, and Xiaojie Yuan. 2022. Learning Disentangled Representation for Multimodal Cross-Domain Sentiment Analysis. TNNLS (2022)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Ru Zhou Wenya Guo Xumeng Liu Shenglong Yu Ying Zhang and Xiaojie Yuan. 2023. AoM: Detecting Aspect-oriented Information for Multimodal Aspect-Based Sentiment Analysis. In ACL Findings.","DOI":"10.18653\/v1\/2023.findings-acl.519"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681189","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681189","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681189"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":41,"alternative-id":["10.1145\/3664647.3681189","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681189","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}