{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:02:34Z","timestamp":1775815354555,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":95,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62325604,62276271"],"award-info":[{"award-number":["62325604,62276271"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681112","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"1554-1563","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":24,"title":["Simple Yet Effective: Structure Guided Pre-trained Transformer for Multi-modal Knowledge Graph Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4837-455X","authenticated-orcid":false,"given":"Ke","family":"Liang","sequence":"first","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2489-573X","authenticated-orcid":false,"given":"Lingyuan","family":"Meng","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9894-0062","authenticated-orcid":false,"given":"Yue","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3900-4204","authenticated-orcid":false,"given":"Meng","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6653-3788","authenticated-orcid":false,"given":"Wei","family":"Wei","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1481-5393","authenticated-orcid":false,"given":"Suyuan","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1353-2968","authenticated-orcid":false,"given":"Wenxuan","family":"Tu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9517-262X","authenticated-orcid":false,"given":"Siwei","family":"Wang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1491-4594","authenticated-orcid":false,"given":"Sihang","family":"Zhou","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9066-1475","authenticated-orcid":false,"given":"Xinwang","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-up and top-down attention for image captioning and visual question answering. In CVPR. 6077--6086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"e_1_3_2_1_3_1","unstructured":"Hangbo Bao Li Dong Songhao Piao and Furu Wei. 2022. BEiT: BERT Pre-Training of Image Transformers. In ICLR."},{"key":"e_1_3_2_1_4_1","volume-title":"Translating embeddings for modeling multi-relational data. NeurIPs","author":"Bordes Antoine","year":"2013","unstructured":"Antoine Bordes, Nicolas Usunier, Alberto Garcia-Duran, Jason Weston, and Oksana Yakhnenko. 2013. Translating embeddings for modeling multi-relational data. NeurIPs (2013)."},{"key":"e_1_3_2_1_5_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. NeurIPs (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Sanxing Chen Xiaodong Liu Jianfeng Gao Jian Jiao Ruofei Zhang and Yangfeng Ji. 2021. HittER: Hierarchical Transformers for Knowledge Graph Embeddings. In EMNLP.","DOI":"10.18653\/v1\/2021.emnlp-main.812"},{"key":"e_1_3_2_1_7_1","volume-title":"Adaptive Scale and Spatial Aggregation for Real-Time Object Detection. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Chen Wei","year":"2023","unstructured":"Wei Chen, Yulin He, Zhengfa Liang, and Yulan Guo. 2023. Adaptive Scale and Spatial Aggregation for Real-Time Object Detection. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},{"key":"e_1_3_2_1_8_1","volume-title":"Unified hallucination detection for multimodal large language models. arXiv preprint arXiv:2402.03190","author":"Chen Xiang","year":"2024","unstructured":"Xiang Chen, Chenxi Wang, Yida Xue, Ningyu Zhang, Xiaoyan Yang, Qiang Li, Yue Shen, Jinjie Gu, and Huajun Chen. 2024. Unified hallucination detection for multimodal large language models. arXiv preprint arXiv:2402.03190 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Xiang Chen Ningyu Zhang Lei Li Shumin Deng Chuanqi Tan Changliang Xu Fei Huang Luo Si and Huajun Chen. 2022. Hybrid Transformer with Multi-Level Fusion for Multimodal Knowledge Graph Completion. In SIGIR.","DOI":"10.1145\/3477495.3531992"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3511998"},{"key":"e_1_3_2_1_11_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020. Uniter: Universal image-text representation learning. In ECCV. Springer, 104--120."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-031-37291-9","volume-title":"Knowledge Graphs Meet Multi-Modal Learning: A Comprehensive Survey. CoRR","volume":"2402","author":"Chen Zhuo","year":"2024","unstructured":"Zhuo Chen, Yichi Zhang, Yin Fang, Yuxia Geng, Lingbing Guo, Xiang Chen, Qian Li, Wen Zhang, Jiaoyan Chen, Yushan Zhu, Jiaqi Li, Xiaoze Liu, Jeff Z. Pan, Ningyu Zhang, and Huajun Chen. 2024. Knowledge Graphs Meet Multi-Modal Learning: A Comprehensive Survey. CoRR, Vol. abs\/2402.05391 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Kewei Cheng Nesreen Ahmed and Yizhou Sun. 2023. Neural Compositional Rule Learning for Knowledge Graph Reasoning. In ICLR.","DOI":"10.1007\/978-3-031-72008-6_5"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Guillem Collell Ted Zhang and Marie-Francine Moens. 2017. Imagined Visual Representations as Multimodal Embeddings. In AAAI.","DOI":"10.1609\/aaai.v31i1.11155"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Tim Dettmers Pasquale Minervini Pontus Stenetorp and Sebastian Riedel. 2018. Convolutional 2d knowledge graph embeddings. In AAAI.","DOI":"10.1609\/aaai.v32i1.11573"},{"key":"e_1_3_2_1_16_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR, Vol. abs\/1810.04805 (2018). [arXiv]1810.04805"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Laura Dietz Alexander Kotov and Edgar Meij. 2018. Utilizing Knowledge Graphs for Text-Centric Information Retrieval. In SIGIR.","DOI":"10.1145\/3209978.3210187"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. https:\/\/doi.org\/10.48550\/ARXIV.2010.11929","DOI":"10.48550\/ARXIV.2010.11929"},{"key":"e_1_3_2_1_19_1","unstructured":"Aleksandr Ermolov Aliaksandr Siarohin Enver Sangineto and Nicu Sebe. 2021. Whitening for self-supervised representation learning. In ICML. 3015--3024."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2024.3406869"},{"key":"e_1_3_2_1_21_1","volume-title":"Better Performance: Efficient Cross-Modal Clip Trimming for Video Moment Retrieval Using Language. In Proceedings of the AAAI Conference on Artificial Intelligence","volume":"38","author":"Fang Xiang","year":"2024","unstructured":"Xiang Fang, Daizong Liu, Wanlong Fang, Pan Zhou, Zichuan Xu, Wenzheng Xu, Junyang Chen, and Renfu Li. 2024. Fewer Steps, Better Performance: Efficient Cross-Modal Clip Trimming for Video Moment Retrieval Using Language. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38. 1735--1743."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3222965"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00242"},{"key":"e_1_3_2_1_24_1","volume-title":"European Conference on Computer Vision. Springer.","author":"Fang Xiang","year":"2024","unstructured":"Xiang Fang, Zeyu Xiong, Wanlong Fang, Xiaoye Qu, Chen Chen, Jianfeng Dong, Keke Tang, Pan Zhou, Yu Cheng, and Daizong Liu. 2024. Rethinking Weakly-supervised Video Temporal Grounding From a Game Perspective. In European Conference on Computer Vision. Springer."},{"key":"e_1_3_2_1_25_1","volume-title":"Marc Aurelio Ranzato, and Tomas Mikolov","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc Aurelio Ranzato, and Tomas Mikolov. 2013. DeViSE: A Deep Visual-Semantic Embedding Model. In NeurIPs."},{"key":"e_1_3_2_1_26_1","volume-title":"Hamilton","author":"Galkin Mikhail","year":"2022","unstructured":"Mikhail Galkin, Etienne Denis, Jiapeng Wu, and William L. Hamilton. 2022. NodePiece: Compositional and Parameter-Efficient Representations of Large Knowledge Graphs. In ICLR."},{"key":"e_1_3_2_1_27_1","volume-title":"Fashionbert: Text and image matching with adaptive loss for cross-modal retrieval. In SIGIR.","author":"Gao Dehong","year":"2020","unstructured":"Dehong Gao, Linbo Jin, Ben Chen, Minghui Qiu, Peng Li, Yi Wei, Yi Hu, and Hao Wang. 2020. Fashionbert: Text and image matching with adaptive loss for cross-modal retrieval. In SIGIR."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475267"},{"key":"e_1_3_2_1_29_1","volume-title":"High-order Topology for Deep Single-cell Multi-view Fuzzy Clustering","author":"Hu Dayu","year":"2024","unstructured":"Dayu Hu, Zhibin Dong, Ke Liang, Hao Yu, Siwei Wang, and Xinwang Liu. 2024. High-order Topology for Deep Single-cell Multi-view Fuzzy Clustering. IEEE Transactions on Fuzzy Systems (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1093\/bib\/bbae102"},{"key":"e_1_3_2_1_31_1","volume-title":"Pan","author":"Hu Zhiwei","year":"2022","unstructured":"Zhiwei Hu, V\u00edctor Guti\u00e9rrez-Basulto, Zhiliang Xiang, Ru Li, and Jeff Z. Pan. 2022. Transformer-based Entity Typing in Knowledge Graphs. (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"Hongjian Dou, Ji-Rong Wen, and Edward Y. Chang.","author":"Huang Jin","year":"2018","unstructured":"Jin Huang, Wayne Xin Zhao, Hongjian Dou, Ji-Rong Wen, and Edward Y. Chang. 2018. Improving Sequential Recommendation with Knowledge-Enhanced Memory Networks. In SIGIR."},{"key":"e_1_3_2_1_33_1","volume-title":"Manning","author":"Hudson Drew A.","year":"2019","unstructured":"Drew A. Hudson and Christopher D. Manning. 2019. GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering. In CVPR."},{"key":"e_1_3_2_1_34_1","volume-title":"Adam: A method for stochastic optimization. arXiv","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv (2014)."},{"key":"e_1_3_2_1_35_1","unstructured":"Rik Koncel-Kedziorski Dhanush Bekal Yi Luan Maria Lapata and Hannaneh Hajishirzi. 2019. Text Generation from Knowledge Graphs with Graph Transformers. In NAACL."},{"key":"e_1_3_2_1_36_1","volume-title":"Unicoder-vl: A universal encoder for vision and language by cross-modal pre-training. In AAAI.","author":"Li Gen","year":"2020","unstructured":"Gen Li, Nan Duan, Yuejian Fang, Ming Gong, and Daxin Jiang. 2020. Unicoder-vl: A universal encoder for vision and language by cross-modal pre-training. In AAAI."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539426"},{"key":"e_1_3_2_1_38_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_39_1","volume-title":"Multi-modal knowledge graph transformer framework for multi-modal entity alignment. arXiv preprint arXiv:2310.06365","author":"Li Qian","year":"2023","unstructured":"Qian Li, Cheng Ji, Shu Guo, Zhaoji Liang, Lihong Wang, and Jianxin Li. 2023. Multi-modal knowledge graph transformer framework for multi-modal entity alignment. arXiv preprint arXiv:2310.06365 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635757"},{"key":"e_1_3_2_1_41_1","volume-title":"House: Knowledge graph embedding with householder parameterization. In ICML.","author":"Li Rui","year":"2022","unstructured":"Rui Li, Jianan Zhao, Chaozhuo Li, Di He, Yiqi Wang, Yuming Liu, Hao Sun, Senzhang Wang, Weiwei Deng, Yanming Shen, et al. 2022. House: Knowledge graph embedding with householder parameterization. In ICML."},{"key":"e_1_3_2_1_42_1","volume-title":"SWEA: Updating Factual Knowledge in Large Language Models via Subject Word Embedding Altering. arxiv: 2401.17809 [cs.CL] https:\/\/arxiv.org\/abs\/2401.17809","author":"Li Xiaopeng","year":"2024","unstructured":"Xiaopeng Li, Shasha Li, Shezheng Song, Huijun Liu, Bin Ji, Xi Wang, Jun Ma, Jie Yu, Xiaodong Liu, Jing Wang, and Weimin Zhang. 2024. SWEA: Updating Factual Knowledge in Large Language Models via Subject Word Embedding Altering. arxiv: 2401.17809 [cs.CL] https:\/\/arxiv.org\/abs\/2401.17809"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29818"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3282989"},{"key":"e_1_3_2_1_45_1","volume-title":"A survey of knowledge graph reasoning on graph types: Static, dynamic, and multi-modal","author":"Liang Ke","year":"2024","unstructured":"Ke Liang, Lingyuan Meng, Meng Liu, Yue Liu, Wenxuan Tu, Siwei Wang, Sihang Zhou, Xinwang Liu, Fuchun Sun, and Kunlun He. 2024. A survey of knowledge graph reasoning on graph types: Static, dynamic, and multi-modal. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611853"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2024.3386168"},{"key":"e_1_3_2_1_48_1","volume-title":"Deep Temporal Graph Clustering. In The 12th International Conference on Learning Representations.","author":"Liu Meng","year":"2024","unstructured":"Meng Liu, Yue Liu, Ke Liang, Wenxuan Tu, Siwei Wang, Sihang Zhou, and Xinwang Liu. 2024. Deep Temporal Graph Clustering. In The 12th International Conference on Learning Representations."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02471"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Xiao Liu Shiyu Zhao Kai Su Yukuo Cen Jiezhong Qiu Mengdi Zhang Wei Wu Yuxiao Dong and Jie Tang. 2022. Mask and Reason: Pre-Training Knowledge Graph Transformers for Complex Logical Queries. In SIGKDD.","DOI":"10.1145\/3534678.3539472"},{"key":"e_1_3_2_1_51_1","volume-title":"Proc. of ICML.","author":"Liu Yue","unstructured":"Yue Liu, Ke Liang, Jun Xia, Sihang Zhou, Xihong Yang, Xinwang Liu, and Z. Stan Li. 2023. Dink-Net: Neural Clustering on Large Graphs. In Proc. of ICML."},{"key":"e_1_3_2_1_52_1","volume-title":"NeurIPs","volume":"32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. NeurIPs, Vol. 32 (2019)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3393148"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/S18-2027"},{"key":"e_1_3_2_1_55_1","volume-title":"Embedding multimodal relational data for knowledge base completion. arXiv preprint arXiv:1809.01341","author":"Pezeshkpour Pouya","year":"2018","unstructured":"Pouya Pezeshkpour, Liyan Chen, and Sameer Singh. 2018. Embedding multimodal relational data for knowledge base completion. arXiv preprint arXiv:1809.01341 (2018)."},{"key":"e_1_3_2_1_56_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog (2019)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Michael Schlichtkrull Thomas N Kipf Peter Bloem Rianne van den Berg Ivan Titov and Max Welling. 2018. Modeling relational data with graph convolutional networks. In ESWC.","DOI":"10.1007\/978-3-319-93417-4_38"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680954"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.3233\/FAIA240612"},{"key":"e_1_3_2_1_60_1","volume-title":"Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530","author":"Su Weijie","year":"2019","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2019. Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Haohai Sun Shangyi Geng Jialun Zhong Han Hu and Kun He. 2022. Graph Hawkes Transformer for Extrapolated Reasoning on Temporal Knowledge Graphs. In EMNLP.","DOI":"10.18653\/v1\/2022.emnlp-main.507"},{"key":"e_1_3_2_1_62_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv (2019)."},{"key":"e_1_3_2_1_63_1","volume-title":"Hamilton","author":"Teru Komal K.","year":"2020","unstructured":"Komal K. Teru, Etienne Denis, and William L. Hamilton. 2020. Inductive Relation Prediction by Subgraph Reasoning. ICML (2020)."},{"key":"e_1_3_2_1_64_1","volume-title":"Mehdi Azabou, Eva L Dyer, Remi Munos, Petar Velivckovi\u0107, and Michal Valko.","author":"Thakoor Shantanu","year":"2021","unstructured":"Shantanu Thakoor, Corentin Tallec, Mohammad Gheshlaghi Azar, Mehdi Azabou, Eva L Dyer, Remi Munos, Petar Velivckovi\u0107, and Michal Valko. 2021. Large-scale representation learning on graphs via bootstrapping. arXiv (2021)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"crossref","unstructured":"Steffen Thoma Achim Rettinger and Fabian Both. 2017. Towards holistic concept representations: Embedding relational knowledge visual attributes and distributional word semantics. In ISWC. 694--710.","DOI":"10.1007\/978-3-319-68288-4_41"},{"key":"e_1_3_2_1_66_1","unstructured":"Th\u00e9o Trouillon Johannes Welbl Sebastian Riedel \u00c9ric Gaussier and Guillaume Bouchard. 2016. Complex embeddings for simple link prediction. In ICML."},{"key":"e_1_3_2_1_67_1","volume-title":"Composition-based multi-relational graph convolutional networks. arXiv","author":"Vashishth Shikhar","year":"2019","unstructured":"Shikhar Vashishth, Soumya Sanyal, Vikram Nitin, and Partha Talukdar. 2019. Composition-based multi-relational graph convolutional networks. arXiv (2019)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2024.3378194"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3388974"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.295"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Meng Wang Sen Wang Han Yang Zheng Zhang Xi Chen and Guilin Qi. 2021. Is Visual Context Really Helpful for Knowledge Graph? A Representation Learning Perspective. In ACM MM.","DOI":"10.1145\/3474085.3475470"},{"key":"e_1_3_2_1_72_1","volume-title":"Language models as knowledge embeddings. arXiv preprint arXiv:2206.12617","author":"Wang Xintao","year":"2022","unstructured":"Xintao Wang, Qianyu He, Jiaqing Liang, and Yanghua Xiao. 2022. Language models as knowledge embeddings. arXiv preprint arXiv:2206.12617 (2022)."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"crossref","unstructured":"Zikang Wang Linjing Li Qiudan Li and Daniel Zeng. 2019. Multimodal Data Enhanced Representation Learning for Knowledge Graphs. In IJCNN.","DOI":"10.1109\/IJCNN.2019.8852079"},{"key":"e_1_3_2_1_74_1","unstructured":"Ruobing Xie Zhiyuan Liu Jia Jia Huanbo Luan and Maosong Sun. 2016. Representation Learning of Knowledge Graphs with Entity Descriptions. In AAAI."},{"key":"e_1_3_2_1_75_1","unstructured":"Ruobing Xie Zhiyuan Liu Huanbo Luan and Maosong Sun. 2017. Image-Embodied Knowledge Representation Learning. In IJCAI."},{"key":"e_1_3_2_1_76_1","unstructured":"Derong Xu Tong Xu Shiwei Wu Jingbo Zhou and Enhong Chen. 2022. Relation-Enhanced Negative Sampling for Multimodal Knowledge Graph Completion. In ACM MM."},{"key":"e_1_3_2_1_77_1","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024","author":"Xu Derong","year":"2024","unstructured":"Derong Xu, Ziheng Zhang, Zhenxi Lin, Xian Wu, Zhihong Zhu, Tong Xu, Xiangyu Zhao, Yefeng Zheng, and Enhong Chen. 2024. Multi-perspective Improvement of Knowledge Graph Completion with Large Language Models. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024). 11956--11968."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i9.26285"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611809"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"crossref","unstructured":"Zuoxi Yang. 2020. Biomedical Information Retrieval Incorporating Knowledge Graph for Explainable Precision Medicine. In SIGIR.","DOI":"10.1145\/3397271.3401458"},{"key":"e_1_3_2_1_81_1","volume-title":"KG-BERT: BERT for knowledge graph completion. arXiv","author":"Yao Liang","year":"2019","unstructured":"Liang Yao, Chengsheng Mao, and Yuan Luo. 2019. KG-BERT: BERT for knowledge graph completion. arXiv (2019)."},{"key":"e_1_3_2_1_82_1","unstructured":"Chengqing Yu Fei Wang Zezhi Shao Tangwen Qian Zhao Zhang Wei Wei and Yongjun Xu. 2024. GinAR: An End-To-End Multivariate Time Series Forecasting Model Suitable for Variable Missing. arxiv: 2405.11333 [cs.LG]"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3614851"},{"key":"e_1_3_2_1_84_1","volume-title":"Learning to Walk with Dual Agents for Knowledge Graph Reasoning. arXiv preprint arXiv:2112.12876","author":"Zhang Denghui","year":"2021","unstructured":"Denghui Zhang, Zixuan Yuan, Hao Liu, Xiaodong Lin, and Hui Xiong. 2021. Learning to Walk with Dual Agents for Knowledge Graph Reasoning. arXiv preprint arXiv:2112.12876 (2021)."},{"key":"e_1_3_2_1_85_1","volume-title":"A Fully Test-Time Training Framework for Semi-Supervised Node Classification on Out-of-Distribution Graphs. ACM Transactions on Knowledge Discovery from Data","author":"Zhang Jiaxin","year":"2024","unstructured":"Jiaxin Zhang, Yiqi Wang, Xihong Yang, and En Zhu. 2024. A Fully Test-Time Training Framework for Semi-Supervised Node Classification on Out-of-Distribution Graphs. ACM Transactions on Knowledge Discovery from Data (2024)."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"crossref","unstructured":"Ningyu Zhang QiangHuai Jia Shumin Deng Xiang Chen Hongbin Ye Hui Chen Huaixiao Tou Gang Huang Zhao Wang Nengwei Hua and Huajun Chen. 2021. AliCG: Fine-Grained and Evolvable Conceptual Graph Construction for Semantic Search at Alibaba. In SIGKDD.","DOI":"10.1145\/3447548.3467057"},{"key":"e_1_3_2_1_87_1","volume-title":"Quaternion knowledge graph embeddings. Advances in neural information processing systems","author":"Zhang Shuai","year":"2019","unstructured":"Shuai Zhang, Yi Tay, Lina Yao, and Qi Liu. 2019. Quaternion knowledge graph embeddings. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_88_1","volume-title":"MyGO: Discrete Modality Information as Fine-Grained Tokens for Multi-modal Knowledge Graph Completion. arXiv preprint arXiv:2404.09468","author":"Zhang Yichi","year":"2024","unstructured":"Yichi Zhang, Zhuo Chen, Lingbing Guo, Yajing Xu, Binbin Hu, Ziqi Liu, Huajun Chen, and Wen Zhang. 2024. MyGO: Discrete Modality Information as Fine-Grained Tokens for Multi-modal Knowledge Graph Completion. arXiv preprint arXiv:2404.09468 (2024)."},{"key":"e_1_3_2_1_89_1","volume-title":"Unleashing the Power of Imbalanced Modality Information for Multi-modal Knowledge Graph Completion. arXiv preprint arXiv:2402.15444","author":"Zhang Yichi","year":"2024","unstructured":"Yichi Zhang, Zhuo Chen, Lei Liang, Huajun Chen, and Wen Zhang. 2024. Unleashing the Power of Imbalanced Modality Information for Multi-modal Knowledge Graph Completion. arXiv preprint arXiv:2402.15444 (2024)."},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"crossref","unstructured":"Zhanqiu Zhang Jianyu Cai Yongdong Zhang and Jie Wang. 2020. Learning Hierarchy-Aware Knowledge Graph Embeddings for Link Prediction. In AAAI.","DOI":"10.1609\/aaai.v34i03.5701"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"crossref","unstructured":"Yu Zhao Xiangrui Cai Yike Wu Haiwei Zhang Ying Zhang Guoqing Zhao and Ning Jiang. 2022. MoSE: Modality Split and Ensemble for Multimodal Knowledge Graph Completion. In EMNLP.","DOI":"10.18653\/v1\/2022.emnlp-main.719"},{"key":"e_1_3_2_1_92_1","volume-title":"MMKGR: Multi-hop Multi-modal Knowledge Graph Reasoning. ArXiv","author":"Zheng Shangfei","year":"2022","unstructured":"Shangfei Zheng, Weiqing Wang, Jianfeng Qu, Hongzhi Yin, Wei Chen, and Lei Zhao. 2022. MMKGR: Multi-hop Multi-modal Knowledge Graph Reasoning. ArXiv, Vol. abs\/2209.01416 (2022)."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3238727"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3223955"},{"key":"e_1_3_2_1_95_1","volume-title":"Multimodal joint attribute prediction and value extraction for e-commerce product. arXiv preprint arXiv:2009.07162","author":"Zhu Tiangang","year":"2020","unstructured":"Tiangang Zhu, Yue Wang, Haoran Li, Youzheng Wu, Xiaodong He, and Bowen Zhou. 2020. Multimodal joint attribute prediction and value extraction for e-commerce product. arXiv preprint arXiv:2009.07162 (2020)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681112","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681112","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:53Z","timestamp":1750294673000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681112"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":95,"alternative-id":["10.1145\/3664647.3681112","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681112","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}