{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T11:45:54Z","timestamp":1762429554141,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592258","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"95-104","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["A Unified Model for Video Understanding and Knowledge Embedding with Heterogeneous Knowledge Graph Dataset"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1274-4154","authenticated-orcid":false,"given":"Jiaxin","family":"Deng","sequence":"first","affiliation":[{"name":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5592-8453","authenticated-orcid":false,"given":"Dong","family":"Shen","sequence":"additional","affiliation":[{"name":"MMU KuaiShou Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6441-892X","authenticated-orcid":false,"given":"Haojie","family":"Pan","sequence":"additional","affiliation":[{"name":"MMU KuaiShou Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4802-2712","authenticated-orcid":false,"given":"Xiangyu","family":"Wu","sequence":"additional","affiliation":[{"name":"MMU KuaiShou Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1685-101X","authenticated-orcid":false,"given":"Ximan","family":"Liu","sequence":"additional","affiliation":[{"name":"MMU KuaiShou Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7103-6321","authenticated-orcid":false,"given":"Gaofeng","family":"Meng","sequence":"additional","affiliation":[{"name":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences; School of Artificial Intelligence, University of Chinese Academy of Sciences; &amp; CAIR, HK Institute of Science and Innovation, Chinese Academy of Sciences"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4570-5885","authenticated-orcid":false,"given":"Fan","family":"Yang","sequence":"additional","affiliation":[{"name":"MMU KuaiShou Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0310-7751","authenticated-orcid":false,"given":"Tingting","family":"Gao","sequence":"additional","affiliation":[{"name":"MMU KuaiShou Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9453-9060","authenticated-orcid":false,"given":"Ruiji","family":"Fu","sequence":"additional","affiliation":[{"name":"MMU KuaiShou Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2987-9672","authenticated-orcid":false,"given":"Zhongyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"MMU KuaiShou Inc., China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Translating embeddings for modeling multi-relational data. Advances in neural information processing systems 26","author":"Bordes Antoine","year":"2013","unstructured":"Antoine Bordes, Nicolas Usunier, Alberto Garcia-Duran, Jason Weston, and Oksana Yakhnenko. 2013. Translating embeddings for modeling multi-relational data. Advances in neural information processing systems 26 (2013)."},{"key":"e_1_3_2_2_2_1","unstructured":"Tom\u00a0B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel\u00a0M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. (2020). arxiv:2005.14165\u00a0[cs.CL]"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-55130-8_12"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_2_2_5_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_6_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475400"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-2024"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.90"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00046"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/2886521.2886624"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.713"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i03.5681"},{"key":"e_1_3_2_2_15_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_2_16_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548257"},{"key":"e_1_3_2_2_18_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van\u00a0den Oord Aaron","year":"2018","unstructured":"Aaron van\u00a0den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00053"},{"key":"e_1_3_2_2_20_1","volume-title":"International Conference on Machine Learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_2_21_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_2_2_22_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2014.2327028"},{"key":"e_1_3_2_2_24_1","volume-title":"Rotate: Knowledge graph embedding by relational rotation in complex space. arXiv preprint arXiv:1902.10197","author":"Sun Zhiqing","year":"2019","unstructured":"Zhiqing Sun, Zhi-Hong Deng, Jian-Yun Nie, and Jian Tang. 2019. Rotate: Knowledge graph embedding by relational rotation in complex space. arXiv preprint arXiv:1902.10197 (2019)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Zequn Sun Wei Hu Qingheng Zhang and Yuzhong Qu. 2018. Bootstrapping Entity Alignment with Knowledge Graph Embedding.. In IJCAI Vol.\u00a018. 4396\u20134402.","DOI":"10.24963\/ijcai.2018\/611"},{"key":"e_1_3_2_2_26_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W15-4007"},{"key":"e_1_3_2_2_28_1","volume-title":"Visualizing data using t-SNE.Journal of machine learning research 9, 11","author":"Maaten Laurens Van\u00a0der","year":"2008","unstructured":"Laurens Van\u00a0der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE.Journal of machine learning research 9, 11 (2008)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00552"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v28i1.8870"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-60045-1_44"}],"event":{"name":"ICMR '23: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Thessaloniki Greece","acronym":"ICMR '23"},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592258","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592258","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:30Z","timestamp":1750178250000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592258"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":31,"alternative-id":["10.1145\/3591106.3592258","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592258","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}