{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:10Z","timestamp":1781539030592,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Natural Science Foundation of Shandong Province of China","award":["ZR2024MF086"],"award-info":[{"award-number":["ZR2024MF086"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810620","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"186-195","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DeHub: Learning Hub-Resistant Representations for Text-Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-4816-0941","authenticated-orcid":false,"given":"Anjun","family":"Jia","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Qingdao University, Qingdao, Shandong Province, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4854-3736","authenticated-orcid":false,"given":"Xiaowei","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Qingdao University, Qingdao, Shandong Province, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_3_1_3_2","unstructured":"Jimmy\u00a0Lei Ba Jamie\u00a0Ryan Kiros and Geoffrey\u00a0E Hinton. 2016. Layer normalization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1607.06450 (2016)."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00575"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_3_1_8_2","first-page":"1597","volume-title":"International conference on machine learning","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PmLR, 1597\u20131607."},{"key":"e_1_3_3_1_9_2","unstructured":"Xing Cheng Hezheng Lin Xiangyu Wu Fan Yang and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.04290 (2021)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Neil Chowdhury Franklin Wang Sumedh Shenoy Douwe Kiela Sarah Schwettmann and Tristan Thrush. 2024. Nearest neighbor normalization improves multimodal retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.24114 (2024).","DOI":"10.18653\/v1\/2024.emnlp-main.1257"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00831"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01138"},{"key":"e_1_3_3_1_13_2","unstructured":"Jianfeng Dong Xirong Li Chaoxi Xu Xun Yang Gang Yang Xun Wang and Meng Wang. 2021. Dual encoding for video retrieval by text. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 8 (2021) 4065\u20134080."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01262"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_3_1_16_2","unstructured":"Bin-Bin Gao Yue Zhou Jiangtao Yan Yuezhi Cai Weixi Zhang Meng Wang Jun Liu Yong Liu Lei Wang and Chengjie Wang. 2025. AdaptCLIP: Adapting CLIP for Universal Visual Anomaly Detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.09926 (2025)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01025"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02242"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02440"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Peng Jin Hao Li Zesen Cheng Jinfa Huang Zhennan Wang Li Yuan Chang Liu and Jie Chen. 2023. Text-video retrieval with disentangled conceptualization and set-to-set alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.12218 (2023).","DOI":"10.24963\/ijcai.2023\/104"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00234"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Ian Jolliffe. 2005. Principal component analysis. Encyclopedia of statistics in behavioral science (2005).","DOI":"10.1002\/0470013192.bsa501"},{"key":"e_1_3_3_1_26_2","volume-title":"International conference on learning representations (ICLR)","author":"Kinga Diederik","year":"2015","unstructured":"Diederik Kinga, Jimmy\u00a0Ba Adam, et\u00a0al. 2015. A method for stochastic optimization. In International conference on learning representations (ICLR) , Vol.\u00a05. California;."},{"key":"e_1_3_3_1_27_2","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1312.6114 (2013)."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.02271"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00379"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00865"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6823"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2025\/643"},{"key":"e_1_3_3_1_33_2","unstructured":"Yang Liu Samuel Albanie Arsha Nagrani and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.13487 (2019)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01286"},{"key":"e_1_3_3_1_35_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1608.03983 (2016)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Huaishao Luo Lei Ji Ming Zhong Yang Chen Wen Lei Nan Duan and Tianrui Li. 2022. Clip4clip: An empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508 (2022) 293\u2013304.","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_3_1_39_2","volume-title":"Machine learning: a probabilistic perspective","author":"Murphy Kevin\u00a0P","year":"2012","unstructured":"Kevin\u00a0P Murphy. 2012. Machine learning: a probabilistic perspective. MIT press."},{"key":"e_1_3_3_1_40_2","unstructured":"Seong\u00a0Joon Oh Kevin Murphy Jiyan Pan Joseph Roth Florian Schroff and Andrew Gallagher. 2018. Modeling uncertainty with hedged instance embedding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.00319 (2018)."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3755468"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01430"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77004-4_1"},{"key":"e_1_3_3_1_44_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_45_2","unstructured":"Milos Radovanovic Alexandros Nanopoulos and Mirjana Ivanovic. 2010. Hubs in space: Popular nearest neighbors in high-dimensional data. Journal of Machine Learning Research 11 sept (2010) 2487\u20132531."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01835"},{"key":"e_1_3_3_1_48_2","unstructured":"Leqi Shen Tianxiang Hao Tao He Sicheng Zhao Yifeng Zhang Pengzhang Liu Yongjun Bao and Guiguang Ding. 2024. Tempme: Video temporal token merging for efficient text-video retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.01156 (2024)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00700"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_4"},{"key":"e_1_3_3_1_51_2","unstructured":"Nenad Toma\u0161ev. 2014. The role of hubness in high-dimensional data analysis. Informatica 38 4 (2014)."},{"key":"e_1_3_3_1_52_2","first-page":"231","volume-title":"Feature selection for data and pattern recognition","author":"Toma\u0161ev Nenad","year":"2014","unstructured":"Nenad Toma\u0161ev, Krisztian Buza, Krist\u00f3f Marussy, and Piroska\u00a0B Kis. 2014. Hubness-aware classification, instance selection and feature construction: Survey and extensions to time-series. In Feature selection for data and pattern recognition. Springer, 231\u2013262."},{"key":"e_1_3_3_1_53_2","unstructured":"Atousa Torabi Niket Tandon and Leonid Sigal. 2016. Learning language-visual embedding for movie understanding with natural-language. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.08124 (2016)."},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01566"},{"key":"e_1_3_3_1_55_2","unstructured":"Qiang Wang Yanhao Zhang Yun Zheng Pan Pan and Xian-Sheng Hua. 2022. Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.07111 (2022)."},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"crossref","unstructured":"Yimu Wang Xiangru Jian and Bo Xue. 2023. Balance act: Mitigating hubness in cross-modal retrieval with query and gallery banks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.11612 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.652"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"e_1_3_3_1_59_2","unstructured":"Ruijia Wu Ping Chen Fei Shen Shaoan Zhao Qiang Hui Huanlin Gao Ting Lu Zhaoxiang Liu Fang Zhao Kai Wang et\u00a0al. 2025. HiMo-CLIP: Modeling Semantic Hierarchy and Monotonicity in Vision-Language Alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.06653 (2025)."},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32935"},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_3_1_62_2","volume-title":"The Eleventh International Conference on Learning Representations","author":"Xue Hongwei","year":"2023","unstructured":"Hongwei Xue, Yuchong Sun, Bei Liu, Jianlong Fu, Ruihua Song, Houqiang Li, and Jiebo Luo. 2023. Clip-vip: Adapting pre-trained image-text model to video-language alignment. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:37:00Z","timestamp":1781537820000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810620"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":62,"alternative-id":["10.1145\/3805622.3810620","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810620","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}