{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:40:11Z","timestamp":1755877211023,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,8]],"date-time":"2023-12-08T00:00:00Z","timestamp":1701993600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,8]]},"DOI":"10.1145\/3638584.3638635","type":"proceedings-article","created":{"date-parts":[[2024,3,14]],"date-time":"2024-03-14T11:15:19Z","timestamp":1710414919000},"page":"408-413","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Ontology-Semantic Alignment On Contrastive Video-Language Model for Multimodel Video Retrieval Task"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2244-1945","authenticated-orcid":false,"given":"Yifan","family":"Zhou","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, State Key Lab of Advanced Optical Communication System and Network, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5166-5753","authenticated-orcid":false,"given":"Yizhou","family":"Ding","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, State Key Lab of Advanced Optical Communication System and Network, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3392-4004","authenticated-orcid":false,"given":"Yuwu","family":"Dong","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, State Key Lab of Advanced Optical Communication System and Network, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4851-7012","authenticated-orcid":false,"given":"Hao","family":"He","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, State Key Lab of Advanced Optical Communication System and Network, China"}]}],"member":"320","published-online":{"date-parts":[[2024,3,14]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Tagging before Alignment: Integrating Multi-Modal Tags for Video-Text Retrieval. arXiv preprint arXiv:2301.12644","author":"Chen Yizhen","year":"2023","unstructured":"Yizhen Chen, Jie Wang, Lijian Lin, Zhongang Qi, Jin Ma, and Ying Shan. 2023. Tagging before Alignment: Integrating Multi-Modal Tags for Video-Text Retrieval. arXiv preprint arXiv:2301.12644 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.127"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-023-02036-y"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the thirteenth international conference on artificial intelligence and statistics. JMLR Workshop and Conference Proceedings, 297\u2013304","author":"Gutmann Michael","year":"2010","unstructured":"Michael Gutmann and Aapo Hyv\u00e4rinen. 2010. Noise-contrastive estimation: A new estimation principle for unnormalized statistical models. In Proceedings of the thirteenth international conference on artificial intelligence and statistics. JMLR Workshop and Conference Proceedings, 297\u2013304."},{"key":"e_1_3_2_1_5_1","volume-title":"Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654","author":"He Pengcheng","year":"2020","unstructured":"Pengcheng He, Xiaodong Liu, Jianfeng Gao, and Weizhu Chen. 2020. Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654 (2020)."},{"key":"e_1_3_2_1_6_1","volume-title":"Segment anything. arXiv preprint arXiv:2304.02643","author":"Kirillov Alexander","year":"2023","unstructured":"Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander\u00a0C Berg, Wan-Yen Lo, 2023. Segment anything. arXiv preprint arXiv:2304.02643 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Estimating mutual information. Physical review E 69, 6","author":"Kraskov Alexander","year":"2004","unstructured":"Alexander Kraskov, Harald St\u00f6gbauer, and Peter Grassberger. 2004. Estimating mutual information. Physical review E 69, 6 (2004), 066138."},{"key":"e_1_3_2_1_8_1","unstructured":"Alex Krizhevsky Geoffrey Hinton 2009. Learning multiple layers of features from tiny images. (2009)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096391"},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Machine Learning. PMLR, 12888\u201312900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888\u201312900."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01524"},{"key":"e_1_3_2_1_12_1","volume-title":"Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McClanahan, Esha Uboweja, Michael Hays, Fan Zhang, Chuo-Ling Chang, Ming\u00a0Guang Yong, Juhyun Lee, 2019. Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172 (2019)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLA51294.2020.00014"},{"key":"e_1_3_2_1_14_1","volume-title":"OWL web ontology language overview. W3C recommendation 10, 10","author":"McGuinness L","year":"2004","unstructured":"Deborah\u00a0L McGuinness, Frank Van\u00a0Harmelen, 2004. OWL web ontology language overview. W3C recommendation 10, 10 (2004), 2004."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.111"},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00554"},{"key":"e_1_3_2_1_20_1","volume-title":"You only learn one representation: Unified network for multiple tasks. arXiv preprint arXiv:2105.04206","author":"Wang Chien-Yao","year":"2021","unstructured":"Chien-Yao Wang, I-Hau Yeh, and Hong-Yuan\u00a0Mark Liao. 2021. You only learn one representation: Unified network for multiple tasks. arXiv preprint arXiv:2105.04206 (2021)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095026"},{"key":"e_1_3_2_1_22_1","volume-title":"Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084","author":"Xu Hu","year":"2021","unstructured":"Hu Xu, Gargi Ghosh, Po-Yao Huang, Dmytro Okhonko, Armen Aghajanyan, Florian Metze, Luke Zettlemoyer, and Christoph Feichtenhofer. 2021. Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462874"},{"key":"e_1_3_2_1_24_1","volume-title":"Being the Foundation of Chinese Cognitive Intelligence. CoRR abs\/2209.02970","author":"Zhang Jiaxing","year":"2022","unstructured":"Jiaxing Zhang, Ruyi Gan, Junjie Wang, Yuxiang Zhang, Lin Zhang, Ping Yang, Xinyu Gao, Ziwei Wu, Xiaoqun Dong, Junqing He, Jianheng Zhuo, Qi Yang, Yongfeng Huang, Xiayu Li, Yanghan Wu, Junyu Lu, Xinyu Zhu, Weifeng Chen, Ting Han, Kunhao Pan, Rui Wang, Hao Wang, Xiaojun Wu, Zhongshen Zeng, and Chongpei Chen. 2022. Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence. CoRR abs\/2209.02970 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00312"}],"event":{"name":"CSAI 2023: 2023 7th International Conference on Computer Science and Artificial Intelligence","acronym":"CSAI 2023","location":"Beijing China"},"container-title":["Proceedings of the 2023 7th International Conference on Computer Science and Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3638584.3638635","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3638584.3638635","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T14:58:01Z","timestamp":1755874681000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3638584.3638635"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,8]]},"references-count":26,"alternative-id":["10.1145\/3638584.3638635","10.1145\/3638584"],"URL":"https:\/\/doi.org\/10.1145\/3638584.3638635","relation":{},"subject":[],"published":{"date-parts":[[2023,12,8]]},"assertion":[{"value":"2024-03-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}