{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:29:38Z","timestamp":1776882578455,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"PHD foundation of Chongqing Normal University","award":["No.19XLB003"],"award-info":[{"award-number":["No.19XLB003"]}]},{"name":"Science and Technology Research Program of Chongqing Municipal Education","award":["No.KJZD-M202000501"],"award-info":[{"award-number":["No.KJZD-M202000501"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.61772295, 61572270, and 61173056"],"award-info":[{"award-number":["No.61772295, 61572270, and 61173056"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592242","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"499-507","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":16,"title":["Learning From Expert: Vision-Language Knowledge Distillation for Unsupervised Cross-Modal Hashing Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6190-6039","authenticated-orcid":false,"given":"Lina","family":"Sun","sequence":"first","affiliation":[{"name":"School of Computer and Information Science, Chongqing Normal University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8406-0606","authenticated-orcid":false,"given":"Yewen","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer and Information Science, Chongqing Normal University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7890-4427","authenticated-orcid":false,"given":"Yumin","family":"Dong","sequence":"additional","affiliation":[{"name":"School of Computer and Information Science, Chongqing Normal University, China"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3390711"},{"key":"e_1_3_2_1_2_1","volume-title":"Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. arXiv preprint arXiv:2111.02358","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Wenhui Wang, Li Dong, Qiang Liu, Owais\u00a0Khan Mohammed, Kriti Aggarwal, Subhojit Som, and Furu Wei. 2021. Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. arXiv preprint arXiv:2111.02358 (2021)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01065"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475605"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2607421"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3323873.3325045"},{"key":"e_1_3_2_1_9_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_1_11_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 2, 7","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, Jeff Dean, 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 2, 7 (2015)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109891"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00319"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1460096.1460104"},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning. PMLR, 4904\u20134916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904\u20134916."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.348"},{"key":"e_1_3_2_1_17_1","volume-title":"Transformers in vision: A survey. ACM computing surveys (CSUR) 54, 10s","author":"Khan Salman","year":"2022","unstructured":"Salman Khan, Muzammal Naseer, Munawar Hayat, Syed\u00a0Waqas Zamir, Fahad\u00a0Shahbaz Khan, and Mubarak Shah. 2022. Transformers in vision: A survey. ACM computing surveys (CSUR) 54, 10s (2022), 1\u201341."},{"key":"e_1_3_2_1_18_1","volume-title":"International Conference on Machine Learning. PMLR, 5583\u20135594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583\u20135594."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00446"},{"key":"e_1_3_2_1_20_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu\u00a0Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021), 9694\u20139705."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3463626"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401086"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_26_1","volume-title":"Unsupervised Contrastive Hashing for Cross-Modal Retrieval in Remote Sensing. In ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4463\u20134467","author":"Mikriukov Georgii","year":"2022","unstructured":"Georgii Mikriukov, Mahdyar Ravanbakhsh, and Beg\u00fcm Demir. 2022. Unsupervised Contrastive Hashing for Cross-Modal Retrieval in Remote Sensing. In ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4463\u20134467."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462829"},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_29_1","volume-title":"Attention-Guided Semantic Hashing for Unsupervised Cross-Modal Retrieval. In 2021 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1\u20136.","author":"Shen Xiao","year":"2021","unstructured":"Xiao Shen, Haofeng Zhang, Lunbo Li, and Li Liu. 2021. Attention-Guided Semantic Hashing for Unsupervised Cross-Modal Retrieval. In 2021 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1\u20136."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3172716"},{"key":"e_1_3_2_1_31_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00312"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00145"},{"key":"e_1_3_2_1_34_1","volume-title":"Graph attention networks. stat 1050","author":"Velickovic Petar","year":"2017","unstructured":"Petar Velickovic, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Lio, and Yoshua Bengio. 2017. Graph attention networks. stat 1050 (2017), 20."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_1_36_1","volume-title":"J. International Conference on Learning Representations (ICLR","author":"Welling Max","year":"2016","unstructured":"Max Welling and Thomas\u00a0N Kipf. 2016. Semi-supervised classification with graph convolutional networks. In J. International Conference on Learning Representations (ICLR 2017)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557265"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3390673"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i5.16592"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413962"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3053766"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-020-00859-y"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2600428.2609610"}],"event":{"name":"ICMR '23: International Conference on Multimedia Retrieval","location":"Thessaloniki Greece","acronym":"ICMR '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592242","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592242","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:51:22Z","timestamp":1750182682000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592242"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":43,"alternative-id":["10.1145\/3591106.3592242","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592242","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}