{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:43Z","timestamp":1765357723654,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72071116"],"award-info":[{"award-number":["72071116"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100007928","name":"Ningbo Municipal Bureau of Science and Technology","doi-asserted-by":"publisher","award":["2022Z173, 2022Z217, 2023Z138, 2023Z237, 2024Z110"],"award-info":[{"award-number":["2022Z173, 2022Z217, 2023Z138, 2023Z237, 2024Z110"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100007928","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680820","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"4650-4659","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Visual-linguistic Cross-domain Feature Learning with Group Attention and Gamma-correct Gated Fusion for Extracting Commonsense Knowledge"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9539-6789","authenticated-orcid":false,"given":"Jialu","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Nottingham Ningbo China, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1003-1726","authenticated-orcid":false,"given":"Xinyi","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Nottingham Ningbo China, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3056-8265","authenticated-orcid":false,"given":"Chenglin","family":"Yao","sequence":"additional","affiliation":[{"name":"University of Nottingham Ningbo China, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4619-6590","authenticated-orcid":false,"given":"Jianfeng","family":"Ren","sequence":"additional","affiliation":[{"name":"University of Nottingham Ningbo China, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9104-2315","authenticated-orcid":false,"given":"Xudong","family":"Jiang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i11.21496"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Shivangi Bithel and Srikanta Bedathur. 2023. Evaluating Cross-modal Generative Models Using Retrieval Task. In ACM SIGIR. 1960--1965.","DOI":"10.1145\/3539618.3591979"},{"key":"e_1_3_2_1_3_1","volume-title":"COMET: Commonsense Transformers for Automatic Knowledge Graph Construction. In ACL. 4762--4779.","author":"Bosselut Antoine","year":"2019","unstructured":"Antoine Bosselut, Hannah Rashkin, Maarten Sap, Chaitanya Malaviya, Asli Celikyilmaz, and Yejin Choi. 2019. COMET: Commonsense Transformers for Automatic Knowledge Graph Construction. In ACL. 4762--4779."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Xiang Chen Ningyu Zhang Lei Li Yunzhi Yao Shumin Deng Chuanqi Tan Fei Huang Luo Si and Huajun Chen. 2022. Good Visual Guidance Make A Better Extractor: Hierarchical Visual Prefix for Multimodal Entity and Relation Extraction. In NAACL. 1607--1618.","DOI":"10.18653\/v1\/2022.findings-naacl.121"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Bo Dai Yuqi Zhang and Dahua Lin. 2017. Detecting visual relationships with deep relational networks. In CVPR. 3076--3086.","DOI":"10.1109\/CVPR.2017.352"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3615355"},{"key":"e_1_3_2_1_7_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HL. 4171--4186.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HL. 4171--4186."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Xiaohan Ding Honghao Chen Xiangyu Zhang Jungong Han and Guiguang Ding. 2022. RepMLPNet: Hierarchical vision mlp with re-parameterized locality. In CVPR. 578--587.","DOI":"10.1109\/CVPR52688.2022.00066"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3284474"},{"key":"e_1_3_2_1_10_1","volume-title":"Seat belt detection using gated Bi-LSTM with part-to-whole attention on diagonally sampled patches. ESWA 123784","author":"Gu Xinyu","year":"2024","unstructured":"Xinyu Gu, Zheng Lu, Jianfeng Ren, and Qian Zhang. 2024. Seat belt detection using gated Bi-LSTM with part-to-whole attention on diagonally sampled patches. ESWA 123784 (2024)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Wentao He Yuchen Yan Jianfeng Ren Ruibin Bai and Xudong Jiang. 2024. Multi-View Spectrogram Transformer for Respiratory Sound Classification. In ICASSP. 8626--8630.","DOI":"10.1109\/ICASSP48485.2024.10445825"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25072"},{"key":"e_1_3_2_1_13_1","volume-title":"Yu","author":"Hu Xuming","year":"2023","unstructured":"Xuming Hu, Junzhe Chen, Aiwei Liu, Shiao Meng, Lijie Wen, and Philip S. Yu. 2023. Prompt me up: Unleashing the power of alignments for multimodal entity and relation extraction. In ACM MM. 5185--5194."},{"key":"e_1_3_2_1_14_1","volume-title":"KBGN: Knowledge-Bridge Graph Network for Adaptive Vision-Text Reasoning in Visual Dialogue. In ACM MM. 1265--1273.","author":"Jiang Xiaoze","year":"2020","unstructured":"Xiaoze Jiang, Siyi Du, Zengchang Qin, Yajing Sun, and Jing Yu. 2020. KBGN: Knowledge-Bridge Graph Network for Adaptive Vision-Text Reasoning in Visual Dialogue. In ACM MM. 1265--1273."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Guohao Li Xin Wang and Wenwu Zhu. 2020. Boosting Visual Question Answering with Context-aware Knowledge Aggregation. In ACM MM. 1227--1235.","DOI":"10.1145\/3394171.3413943"},{"key":"e_1_3_2_1_17_1","unstructured":"Xiang Li Aynaz Taheri Lifu Tu and Kevin Gimpel. 2016. Commonsense knowledge base completion. In ACL. 1445--1455."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Zhenyang Li Yangyang Guo Kejie Wang Xiaolin Chen Liqiang Nie and Mohan Kankanhalli. 2023. Do Vision-Language Transformers Exhibit Visual Commonsense? An Empirical Study of VCR. In ACM MM. 5634--5644.","DOI":"10.1145\/3581783.3612395"},{"key":"e_1_3_2_1_19_1","unstructured":"Bill Yuchen Lin Seyeon Lee Rahul Khanna and Xiang Ren. 2020. Birds have four legs?! NumerSense: Probing numerical commonsense knowledge of pre-trained language models. In EMNLP. 6862--6868."},{"key":"e_1_3_2_1_20_1","unstructured":"Yankai Lin Shiqi Shen Zhiyuan Liu Huanbo Luan and Maosong Sun. 2016. Neural Relation Extraction with Selective Attention over Instances. In ACL. 2124--2133."},{"key":"e_1_3_2_1_21_1","volume-title":"Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio.","author":"Lin Zhouhan","year":"2017","unstructured":"Zhouhan Lin, Minwei Feng, C\u00edcero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017. A structured self-attentive sentence embedding. In ICLR. 1--15."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Xiao Liu Da Yin Yansong Feng and Dongyan Zhao. 2022. Things not Written in Text: Exploring Spatial Commonsense from Visual Signals. In ACL. 2365--2376.","DOI":"10.18653\/v1\/2022.acl-long.168"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26565"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Xusheng Luo Le Bo Jinhang Wu Lin Li Zhiy Luo Yonghua Yang and Keping Yang. 2021. AliCoCo2: Commonsense knowledge extraction representation and application in e-commerce. In SIGKDD. 3385--3393.","DOI":"10.1145\/3447548.3467203"},{"key":"e_1_3_2_1_25_1","volume-title":"Multi-source knowledge reasoning graph network for multi-modal commonsense inference. TOMMCCAP 19, 141","author":"Ma Xuan","year":"2023","unstructured":"Xuan Ma, Xiaoshan Yang, and Changsheng Xu. 2023. Multi-source knowledge reasoning graph network for multi-modal commonsense inference. TOMMCCAP 19, 141 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2022.3206505"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Cory Paik St\u00e9phane Aroca-Ouellette Alessandro Roncone and Katharina Kann. 2021. The world of an octopus: How reporting bias influences a language model?s perception of color. In EMNLP. 823--835.","DOI":"10.18653\/v1\/2021.emnlp-main.63"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Hao Peng Tianyu Gao Xu Han Yankai Lin Peng Li Zhiyuan Liu Maosong Sun and Jie Zhou. 2020. Learning from context or names? An empirical study on neural relation extraction. In EMNLP. 3661--3672.","DOI":"10.18653\/v1\/2020.emnlp-main.298"},{"key":"e_1_3_2_1_29_1","volume-title":"Glove: Global vectors for word representation. In EMNLP. 1532--1543.","author":"Pennington Jeffrey","year":"2014","unstructured":"Jeffrey Pennington, Richard Socher, and Christopher D Manning. 2014. Glove: Global vectors for word representation. In EMNLP. 1532--1543."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Fabio Petroni Tim Rockt\u00e4schel Sebastian Riedel Patrick Lewis Anton Bakhtin YuxiangWu and Alexander Miller. 2019. Language Models as Knowledge Bases?. In EMNLP-IJCNLP. 2463--2473.","DOI":"10.18653\/v1\/D19-1250"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Wenqi Ren Lin Ma Jiawei Zhang Jinshan Pan Xiaochun Cao Wei Liu and Ming-Hsuan Yang. 2018. Gated fusion network for single image dehazing. In CVPR. 3253--3261.","DOI":"10.1109\/CVPR.2018.00343"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013027"},{"volume-title":"Proc. 4th Workshop Vis. Lang. 70--80","author":"Schuster Sebastian","key":"e_1_3_2_1_33_1","unstructured":"Sebastian Schuster, Ranjay Krishna, Angel Chang, Li Fei-Fei, and Christopher D. Manning. 2015. Generating Semantically Precise Scene Graphs from Textual Descriptions for Improved Image Retrieval. In Proc. 4th Workshop Vis. Lang. 70--80."},{"key":"e_1_3_2_1_34_1","volume-title":"ViT-GPT2 Image Captioning. Huggingface","author":"Shieh Yih-Dar","year":"2022","unstructured":"Yih-Dar Shieh. 2022. ViT-GPT2 Image Captioning. Huggingface (2022). https: \/\/huggingface.co\/nlpconnect\/vit-gpt2-image-captioning"},{"key":"e_1_3_2_1_35_1","volume-title":"Chandra Bhagavatula, and Yejin Choi.","author":"Shwartz Vered","year":"2020","unstructured":"Vered Shwartz, Peter West, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. 2020. Unsupervised Commonsense Question Answering with Self-Talk. In EMNLP. 4615--4629."},{"key":"e_1_3_2_1_36_1","volume-title":"DOCK: Detecting Objects by transferring Common-sense Knowledge. In ECCV. 492--508.","author":"Singh Krishna Kumar","year":"2018","unstructured":"Krishna Kumar Singh, Santosh Divvala, Ali Farhadi, and Yong Jae Lee. 2018. DOCK: Detecting Objects by transferring Common-sense Knowledge. In ECCV. 492--508."},{"key":"e_1_3_2_1_37_1","volume-title":"An open multilingual graph of general knowledge. AAAI 31","author":"Speer Robyn","year":"2017","unstructured":"Robyn Speer, Joshua Chin, and Catherine Havasi. 2017. ConceptNet 5.5: An open multilingual graph of general knowledge. AAAI 31 (2017)."},{"key":"e_1_3_2_1_38_1","first-page":"24261","article-title":"MLP-Mixer: An all-MLP Architecture for Vision","volume":"34","author":"Tolstikhin Ilya O","year":"2021","unstructured":"Ilya O Tolstikhin, Neil Houlsby, Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Thomas Unterthiner, Jessica Yung, Andreas Steiner, Daniel Keysers, Jakob Uszkoreit, Mario Lucic, and Alexey Dosovitskiy. 2021. MLP-Mixer: An all-MLP Architecture for Vision. In NeurIPS, Vol. 34. 24261--24272.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/2629489"},{"key":"e_1_3_2_1_40_1","first-page":"4313","article-title":"Mask Attack Detection Using Vascular-Weighted Motion-Robust rPPG Signals","volume":"18","author":"Yao Chenglin","year":"2023","unstructured":"Chenglin Yao, Jianfeng Ren, Ruibin Bai, Heshan Du, Jiang Liu, and Xudong Jiang. 2023. Mask Attack Detection Using Vascular-Weighted Motion-Robust rPPG Signals. TIFS 18 (2023), 4313--4328.","journal-title":"TIFS"},{"key":"e_1_3_2_1_41_1","volume-title":"Progressively-orthogonally-mapped EfficientNet for action recognition on time-range-Doppler signature. ESWA 124824","author":"Yao Chenglin","year":"2024","unstructured":"Chenglin Yao, Jianfeng Ren, Ruibin Bai, Heshan Du, Jiang Liu, and Xudong Jiang. 2024. Progressively-orthogonally-mapped EfficientNet for action recognition on time-range-Doppler signature. ESWA 124824 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i5.25809"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Yuan Yao Ao Zhang Xu Han Mengdi Li Cornelius Weber Zhiyuan Liu Stefan Wermter and Maosong Sun. 2021. Visual Distant Supervision for Scene Graph Generation. In ICCV. 15816--15826.","DOI":"10.1109\/ICCV48922.2021.01552"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Mark Yatskar Vicente Ordonez and Ali Farhadi. 2016. Stating the obvious: Extracting visual common sense knowledge. In NAACL-HLT. 193--198.","DOI":"10.18653\/v1\/N16-1023"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Shuhong Ye Weikai Kong Chenglin Yao Jianfeng Ren and Xudong Jiang. 2023. Video Question Answering Using Clip-Guided Visual-Text Attention. In ICIP. 81--85.","DOI":"10.1109\/ICIP49359.2023.10222286"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eij.2012.08.002"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Daojian Zeng Kang Liu Yubo Chen and Jun Zhao. 2015. Distant Supervision for Relation Extraction via Piecewise Convolutional Neural Networks. In EMNLP. 1753--1762.","DOI":"10.18653\/v1\/D15-1203"},{"key":"e_1_3_2_1_48_1","volume-title":"Zhuowan Li, and Elias Stengel-Eskin.","author":"Zhang Chenyu","year":"2022","unstructured":"Chenyu Zhang, Benjamin Van Durme, Zhuowan Li, and Elias Stengel-Eskin. 2022. Visual Commonsense in Pretrained Unimodal and Multimodal Models. In NAACL-HL. 5321--5335."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3266161"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i1.27795"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Pengchuan Zhang Xiujun Li Xiaowei Hu Jianwei Yang Lei Zhang Lijuan Wang Yejin Choi and Jianfeng Gao. 2021. VinVL: Revisiting visual representations in vision-language models. In CVPR. 5579--5588.","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Shan Zhao Minghao Hu Zhiping Cai and Fang Liu. 2021. Modeling dense cross-modal interactions for joint entity-relation extraction. In IJCAI. 4032--4038.","DOI":"10.24963\/ijcai.2020\/558"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Changmeng Zheng Junhao Feng Ze Fu Yi Cai Qing Li and Tao Wang. 2021. Multimodal Relation Extraction with Efficient Graph Alignment. In ACM MM. 5298--5306.","DOI":"10.1145\/3474085.3476968"},{"key":"e_1_3_2_1_54_1","volume-title":"MNRE: A challenge multimodal dataset for neural relation extraction with visual evidence in social media posts. In ICME. 1--6.","author":"Zheng Changmeng","year":"2021","unstructured":"Changmeng Zheng, Zhiwei Wu, Junhao Feng, Ze Fu, and Yi Cai. 2021. MNRE: A challenge multimodal dataset for neural relation extraction with visual evidence in social media posts. In ICME. 1--6."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680820","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680820","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:07Z","timestamp":1750295887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680820"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":54,"alternative-id":["10.1145\/3664647.3680820","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680820","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}