{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T05:21:59Z","timestamp":1772083319420,"version":"3.50.1"},"reference-count":149,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2024,2,18]],"date-time":"2024-02-18T00:00:00Z","timestamp":1708214400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,2,18]],"date-time":"2024-02-18T00:00:00Z","timestamp":1708214400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1007\/s11263-024-02009-7","type":"journal-article","created":{"date-parts":[[2024,2,18]],"date-time":"2024-02-18T07:01:51Z","timestamp":1708239711000},"page":"2765-2797","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Hugs Bring Double Benefits: Unsupervised Cross-Modal Hashing with Multi-granularity Aligned 
Transformers"],"prefix":"10.1007","volume":"132","author":[{"given":"Jinpeng","family":"Wang","sequence":"first","affiliation":[]},{"given":"Ziyun","family":"Zeng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4798-230X","authenticated-orcid":false,"given":"Bin","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yuting","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Dongliang","family":"Liao","sequence":"additional","affiliation":[]},{"given":"Gongfu","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yiru","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shu-Tao","family":"Xia","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,2,18]]},"reference":[{"key":"2009_CR1","unstructured":"An, X., Deng, J., Yang, K., Li, J., Feng, Z., Guo, J., Yang, J., & Liu, T. (2023). Unicom: Universal and compact representation learning for image retrieval. In ICLR. OpenReview.net."},{"key":"2009_CR2","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Gron\u00e1t, P., Torii, A., Pajdla, T., & Sivic, J. (2016). Netvlad: CNN architecture for weakly supervised place recognition. In CVPR (pp. 5297\u20135307). IEEE Computer Society.","DOI":"10.1109\/CVPR.2016.572"},{"key":"2009_CR3","doi-asserted-by":"crossref","unstructured":"Asadi, N., & Lin, J. (2013). Effectiveness\/efficiency tradeoffs for candidate generation in multi-stage retrieval architectures. In SIGIR (pp. 997\u20131000). ACM.","DOI":"10.1145\/2484028.2484132"},{"key":"2009_CR4","doi-asserted-by":"crossref","unstructured":"Babenko, A., & Lempitsky, V.\u00a0S. (2014). Additive quantization for extreme vector compression. In CVPR (pp. 931\u2013938). IEEE Computer Society.","DOI":"10.1109\/CVPR.2014.124"},{"key":"2009_CR5","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., & Zisserman, A. (2021). 
Frozen in time: A joint video and image encoder for end-to-end retrieval. In ICCV, (pp. 1708\u20131718). IEEE.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"2009_CR6","unstructured":"Bao, H., Wang, W., Dong, L., Liu, Q., Mohammed, O.\u00a0K., Aggarwal, K., Som, S., Piao, S., & Wei, F. (2022b). Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. In NeurIPS."},{"key":"2009_CR7","unstructured":"Bao, H., Dong, L., Piao, S., & Wei, F. (2022). Beit: BERT pre-training of image transformers. In ICLR. OpenReview.net."},{"key":"2009_CR8","unstructured":"Bengio, Y., L\u00e9onard, N., & Courville, A.\u00a0C. (2013). Estimating or propagating gradients through stochastic neurons for conditional computation. CoRR abs\/1308.3432."},{"key":"2009_CR9","doi-asserted-by":"crossref","unstructured":"Cao, Y., Liu, B., Long, M., & Wang, J. (2018). Cross-modal hamming hashing. In ECCV, volume 11205 (pp. 207\u2013223). Springer.","DOI":"10.1007\/978-3-030-01246-5_13"},{"key":"2009_CR10","doi-asserted-by":"crossref","unstructured":"Cao, Y., Long, M., Wang, J., & Liu, S. (2017). Deep visual-semantic quantization for efficient image retrieval. In CVPR (pp. 916\u2013925). IEEE Computer Society.","DOI":"10.1109\/CVPR.2017.104"},{"key":"2009_CR11","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In ECCV, volume 12346 (pp. 213\u2013229). Springer.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2009_CR12","unstructured":"Chen, D.\u00a0L., & Dolan, W.\u00a0B. (2011). Collecting highly parallel data for paraphrase evaluation. In ACL (pp. 190\u2013200). The Association for Computer Linguistics."},{"key":"2009_CR13","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G.\u00a0E. (2020a). A simple framework for contrastive learning of visual representations. In ICML, volume 119 (pp. 1597\u20131607). 
PMLR."},{"key":"2009_CR14","unstructured":"Chen, T., Li, L., & Sun, Y. (2020b). Differentiable product quantization for end-to-end embedding compression. In ICML, volume 119 (pp. 1617\u20131626). PMLR."},{"key":"2009_CR15","doi-asserted-by":"crossref","unstructured":"Chen, Y., Wang, S., Lu, J., Chen, Z., Zhang, Z., & Huang, Z. (2021). Local graph convolutional networks for cross-modal hashing. In ACM Multimedia (pp. 1921\u20131928). ACM.","DOI":"10.1145\/3474085.3475346"},{"key":"2009_CR16","doi-asserted-by":"crossref","unstructured":"Chen, Z., Yu, W., Li, C., Nie, L., & Xu, X. (2018). Dual deep neural networks cross-modal hashing. In AAAI (pp. 274\u2013281). AAAI.","DOI":"10.1609\/aaai.v32i1.11249"},{"key":"2009_CR17","doi-asserted-by":"crossref","unstructured":"Chen, Y., Zhang, S., Liu, F., Chang, Z., Ye, M., & Qi, Z. (2022). Transhash: Transformer-based hamming hashing for efficient image retrieval. In ICMR (pp. 127\u2013136). ACM.","DOI":"10.1145\/3512527.3531405"},{"key":"2009_CR18","doi-asserted-by":"crossref","unstructured":"Chua, T., Tang, J., Hong, R., Li, H., Luo, Z., & Zheng, Y. (2009). NUS-WIDE: A real-world web image database from national university of singapore. In CIVR. ACM.","DOI":"10.1145\/1646396.1646452"},{"key":"2009_CR19","doi-asserted-by":"crossref","unstructured":"Cui, H., Zhu, L., Li, J., Cheng, Z., & Zhang, Z. (2021). Two-pronged strategy: Lightweight augmented graph network hashing for scalable image retrieval. In ACM Multimedia (pp. 1432\u20131440). ACM.","DOI":"10.1145\/3474085.3475605"},{"key":"2009_CR20","doi-asserted-by":"crossref","unstructured":"Datar, M., Immorlica, N., Indyk, P., & Mirrokni, V.\u00a0S. (2004). Locality-sensitive hashing scheme based on p-stable distributions. In SCG (pp. 253\u2013262). ACM.","DOI":"10.1145\/997817.997857"},{"key":"2009_CR21","unstructured":"Devlin, J., Chang, M., Lee, K., & Toutanova, K. (2019). BERT: pre-training of deep bidirectional transformers for language understanding. In NAACL (pp. 
4171\u20134186). Association for Computational Linguistics."},{"issue":"11","key":"2009_CR22","doi-asserted-by":"publisher","first-page":"5427","DOI":"10.1109\/TIP.2016.2607421","volume":"25","author":"G Ding","year":"2016","unstructured":"Ding, G., Guo, Y., Zhou, J., & Gao, Y. (2016). Large-scale cross-modality search via collective matrix factorization hashing. IEEE Transactions on Image Processing, 25(11), 5427\u20135440.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2009_CR23","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In ICLR. OpenReview.net."},{"key":"2009_CR24","doi-asserted-by":"crossref","unstructured":"Dubey, S.\u00a0R., Singh, S.\u00a0K., & Chu, W. (2022). Vision transformer hashing for image retrieval. In ICME (pp. 1\u20136). IEEE.","DOI":"10.1109\/ICME52920.2022.9859900"},{"key":"2009_CR25","unstructured":"Ester, M., Kriegel, H., Sander, J., & Xu, X. (1996). A density-based algorithm for discovering clusters in large spatial databases with noise. In KDD (pp. 226\u2013231). AAAI."},{"key":"2009_CR26","doi-asserted-by":"crossref","unstructured":"Fang, B., Wu, W., Liu, C., Zhou, Y., Song, Y., Wang, W., Shu, X., Ji, X., & Wang, J. (2023). UATVR: Uncertainty-adaptive text-video retrieval. In ICCV (pp. 13677\u201313687). IEEE.","DOI":"10.1109\/ICCV51070.2023.01262"},{"key":"2009_CR27","doi-asserted-by":"crossref","unstructured":"Gabeur, V., Sun, C., Alahari, K., & Schmid, C. (2020). Multi-modal transformer for video retrieval. In ECCV, volume 12349 (pp. 214\u2013229). Springer.","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"2009_CR28","doi-asserted-by":"crossref","unstructured":"Gao, D., Jin, L., Chen, B., Qiu, M., Li, P., Wei, Y., Hu, Y., & Wang, H. (2020). 
Fashionbert: Text and image matching with adaptive loss for cross-modal retrieval. In SIGIR (pp. 2251\u20132260). ACM.","DOI":"10.1145\/3397271.3401430"},{"key":"2009_CR29","doi-asserted-by":"crossref","unstructured":"Ge, T., He, K., Ke, Q., & Sun, J. (2013). Optimized product quantization for approximate nearest neighbor search. In CVPR (pp. 2946\u20132953). IEEE Computer Society.","DOI":"10.1109\/CVPR.2013.379"},{"issue":"12","key":"2009_CR30","doi-asserted-by":"publisher","first-page":"2916","DOI":"10.1109\/TPAMI.2012.193","volume":"35","author":"Y Gong","year":"2013","unstructured":"Gong, Y., Lazebnik, S., Gordo, A., & Perronnin, F. (2013). Iterative quantization: A procrustean approach to learning binary codes for large-scale image retrieval. IEEE Transactions on Pattern Analysis and Machine Intelligence, 35(12), 2916\u20132929.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2009_CR31","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., & Girshick, R.\u00a0B. (2020). Momentum contrast for unsupervised visual representation learning. In CVPR (pp. 9726\u20139735). Computer Vision Foundation\/IEEE.","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2009_CR32","unstructured":"He, X., Pan, Y., Tang, M., & Lv, Y. (2021). Self-supervised video retrieval transformer network. CoRR, abs\/2104.07993."},{"key":"2009_CR33","unstructured":"Heo, J., Lee, Y., He, J., Chang, S., & Yoon, S. (2012). Spherical hashing. In CVPR (pp. 2957\u20132964). IEEE Computer Society."},{"key":"2009_CR34","doi-asserted-by":"publisher","first-page":"8391","DOI":"10.1109\/TIP.2020.3014727","volume":"29","author":"T Hoang","year":"2020","unstructured":"Hoang, T., Do, T., Nguyen, T. V., & Cheung, N. (2020). Unsupervised deep cross-modality spectral hashing. 
IEEE Transactions on Image Processing, 29, 8391\u20138406.","journal-title":"IEEE Transactions on Image Processing"},{"issue":"9","key":"2009_CR35","doi-asserted-by":"publisher","first-page":"6289","DOI":"10.1109\/TNNLS.2021.3135420","volume":"34","author":"T Hoang","year":"2023","unstructured":"Hoang, T., Do, T., Nguyen, T. V., & Cheung, N. (2023). Multimodal mutual information maximization: A novel approach for unsupervised deep cross-modal hashing. IEEE Trans. Neural Networks Learn. Syst., 34(9), 6289\u20136302.","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"key":"2009_CR36","doi-asserted-by":"crossref","unstructured":"Hu, H., Xie, L., Hong, R., & Tian, Q. (2020). Creating something from nothing: Unsupervised knowledge distillation for cross-modal hashing. In CVPR, (pp. 3120\u20133129). Computer Vision Foundation\/IEEE.","DOI":"10.1109\/CVPR42600.2020.00319"},{"key":"2009_CR37","doi-asserted-by":"crossref","unstructured":"Huiskes, M.\u00a0J., & Lew, M.\u00a0S. (2008). The MIR flickr retrieval evaluation. In Multimedia Information Retrieval (pp. 39\u201343). ACM.","DOI":"10.1145\/1460096.1460104"},{"issue":"7","key":"2009_CR38","doi-asserted-by":"publisher","first-page":"1811","DOI":"10.1007\/s11263-022-01615-7","volume":"130","author":"M Humenberger","year":"2022","unstructured":"Humenberger, M., Cabon, Y., Pion, N., Weinzaepfel, P., Lee, D., Gu\u00e9rin, N., Sattler, T., & Csurka, G. (2022). Investigating the role of image retrieval for visual localization: An exhaustive benchmark. International Journal of Computer Vision, 130(7), 1811\u20131836.","journal-title":"International Journal of Computer Vision"},{"issue":"4","key":"2009_CR39","doi-asserted-by":"publisher","first-page":"973","DOI":"10.1109\/TMM.2018.2866771","volume":"21","author":"D Hu","year":"2019","unstructured":"Hu, D., Nie, F., & Li, X. (2019). Deep binary reconstruction for cross-modal hashing. 
IEEE Transactions on Multimedia, 21(4), 973\u2013985.","journal-title":"IEEE Transactions on Multimedia"},{"issue":"6","key":"2009_CR40","doi-asserted-by":"publisher","first-page":"2770","DOI":"10.1109\/TIP.2018.2890144","volume":"28","author":"M Hu","year":"2019","unstructured":"Hu, M., Yang, Y., Shen, F., Xie, N., Hong, R., & Shen, H. T. (2019). Collective reconstructive embeddings for cross-modal hashing. IEEE Transactions on Image Processing, 28(6), 2770\u20132784.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2009_CR41","unstructured":"Ioffe, S., & Szegedy, C. (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift. In ICML, volume\u00a037 (pp. 448\u2013456). JMLR.org."},{"key":"2009_CR42","doi-asserted-by":"crossref","unstructured":"Irie, G., Arai, H., & Taniguchi, Y. (2015). Alternating co-quantization for cross-modal hashing. In ICCV (pp. 1886\u20131894). IEEE Computer Society.","DOI":"10.1109\/ICCV.2015.219"},{"key":"2009_CR43","doi-asserted-by":"crossref","unstructured":"Jang, Y.\u00a0K., Cho, N.\u00a0I. (2021). Self-supervised product quantization for deep unsupervised image retrieval. In ICCV (pp. 12065\u201312074). IEEE.","DOI":"10.1109\/ICCV48922.2021.01187"},{"issue":"1","key":"2009_CR44","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1109\/TPAMI.2010.57","volume":"33","author":"H J\u00e9gou","year":"2011","unstructured":"J\u00e9gou, H., Douze, M., & Schmid, C. (2011). Product quantization for nearest neighbor search. IEEE Transactions on Pattern Analysis and Machine Intelligence, 33(1), 117\u2013128.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2009_CR45","doi-asserted-by":"crossref","unstructured":"Jiang, Q., Li, W. (2017). Deep cross-modal hashing. In CVPR (pp. 3270\u20133278). 
IEEE Computer Society.","DOI":"10.1109\/CVPR.2017.348"},{"key":"2009_CR46","doi-asserted-by":"crossref","unstructured":"Jin, P., Huang, J., Xiong, P., Tian, S., Liu, C., Ji, X., Yuan, L., & Chen, J. (2023a). Video-text as game players: Hierarchical banzhaf interaction for cross-modal representation learning. In CVPR (pp. 2472\u20132482). IEEE.","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"2009_CR47","doi-asserted-by":"crossref","unstructured":"Jin, P., Li, H., Cheng, Z., Li, K., Ji, X., Liu, C., Yuan, L., & Chen, J. (2023b). Diffusionret: Generative text-video retrieval with diffusion model. In ICCV (pp. 2470\u20132481). IEEE.","DOI":"10.1109\/ICCV51070.2023.00234"},{"issue":"3","key":"2009_CR48","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","volume":"7","author":"J Johnson","year":"2021","unstructured":"Johnson, J., Douze, M., & J\u00e9gou, H. (2021). Billion-scale similarity search with gpus. IEEE Trans. Big Data, 7(3), 535\u2013547.","journal-title":"IEEE Trans. Big Data"},{"key":"2009_CR49","doi-asserted-by":"crossref","unstructured":"Kalantidis, Y., & Avrithis, Y. (2014). Locally optimized product quantization for approximate nearest neighbor search. In CVPR (pp. 2329\u20132336). IEEE Computer Society.","DOI":"10.1109\/CVPR.2014.298"},{"key":"2009_CR50","unstructured":"Kingma, D.\u00a0P., & Ba, J. (2015). Adam: A method for stochastic optimization. In ICLR."},{"key":"2009_CR51","doi-asserted-by":"crossref","unstructured":"Klein, B.\u00a0E., Wolf, L. (2019). End-to-end supervised product quantization for image search and retrieval. In CVPR (pp. 5041\u20135050). Computer Vision Foundation \/ IEEE.","DOI":"10.1109\/CVPR.2019.00518"},{"key":"2009_CR52","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G.\u00a0E. (2012). Imagenet classification with deep convolutional neural networks. In NeurIPS, (pp. 1106\u20131114)."},{"key":"2009_CR53","unstructured":"Kumar, S., & Udupa, R. (2011). 
Learning hash functions for cross-view similarity search. In IJCAI (pp. 1360\u20131365). IJCAI\/AAAI."},{"key":"2009_CR54","unstructured":"Le, Q.\u00a0V., Mikolov, T. (2014). Distributed representations of sentences and documents. In ICML, volume\u00a032 (pp. 1188\u20131196). JMLR.org."},{"key":"2009_CR55","doi-asserted-by":"crossref","unstructured":"Li, M., & Wang, H. (2021). Unsupervised deep cross-modal hashing by knowledge distillation for large-scale cross-modal retrieval. In ICMR (pp. 183\u2013191). ACM.","DOI":"10.1145\/3460426.3463626"},{"key":"2009_CR56","doi-asserted-by":"crossref","unstructured":"Li, C., Deng, C., Wang, L., Xie, D., & Liu, X. (2019). Coupled cyclegan: Unsupervised hashing network for cross-modal retrieval. In AAAI (pp. 176\u2013183). AAAI.","DOI":"10.1609\/aaai.v33i01.3301176"},{"key":"2009_CR57","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., & Jiang, D. (2020a). Unicoder-vl: A universal encoder for vision and language by cross-modal pre-training. In AAAI (pp. 11336\u201311344). AAAI.","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"2009_CR58","doi-asserted-by":"crossref","unstructured":"Li, S., Li, X., Lu, J., & Zhou, J. (2021b). Self-supervised video hashing via bidirectional transformers. In CVPR (pp. 13549\u201313558). Computer Vision Foundation \/ IEEE.","DOI":"10.1109\/CVPR46437.2021.01334"},{"key":"2009_CR59","doi-asserted-by":"crossref","unstructured":"Li, P., Xie, H., Ge, J., Zhang, L., Min, S., & Zhang, Y. (2022a). Dual-stream knowledge-preserving hashing for unsupervised video retrieval. In ECCV, volume 13674 (pp. 181\u2013197). Springer.","DOI":"10.1007\/978-3-031-19781-9_11"},{"key":"2009_CR60","doi-asserted-by":"crossref","unstructured":"Lin, T., Maire, M., Belongie, S.\u00a0J., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.\u00a0L. (2014). Microsoft COCO: Common objects in context. In ECCV, volume 8693 (pp. 740\u2013755). 
Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2009_CR61","doi-asserted-by":"crossref","unstructured":"Lin, X., Tiwari, S., Huang, S., Li, M., Shou, M.\u00a0Z., Ji, H., & Chang, S. (2023). Towards fast adaptation of pretrained contrastive models for multi-channel video-language retrieval. In CVPR (pp. 14846\u201314855). IEEE.","DOI":"10.1109\/CVPR52729.2023.01426"},{"key":"2009_CR62","doi-asserted-by":"crossref","unstructured":"Liong, V.\u00a0E., Lu, J., Wang, G., Moulin, P., & Zhou, J. (2015). Deep hashing for compact binary codes learning. In CVPR (pp. 2475\u20132483). IEEE Computer Society.","DOI":"10.1109\/CVPR.2015.7298862"},{"issue":"8","key":"2009_CR63","doi-asserted-by":"publisher","first-page":"2204","DOI":"10.1007\/s11263-020-01327-w","volume":"128","author":"Q Li","year":"2020","unstructured":"Li, Q., Sun, Z., He, R., & Tan, T. (2020). A general framework for deep supervised discrete hashing. International Journal of Computer Vision, 128(8), 2204\u20132222.","journal-title":"International Journal of Computer Vision"},{"key":"2009_CR64","unstructured":"Liu, Y., Albanie, S., Nagrani, A., and Zisserman, A. (2019b). Use what you have: Video retrieval using representations from collaborative experts. In BMVC, (p. 279). BMVA."},{"key":"2009_CR65","doi-asserted-by":"crossref","unstructured":"Liu, S., Fan, H., Qian, S., Chen, Y., Ding, W., and Wang, Z. (2021a). Hit: Hierarchical transformer with momentum contrast for video-text retrieval. In ICCV (pp. 11895\u201311905). IEEE.","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"2009_CR66","doi-asserted-by":"crossref","unstructured":"Liu, H., Ji, R., Wu, Y., Huang, F., & Zhang, B. (2017). Cross-modality binary code learning via fusion similarity hashing. In CVPR (pp. 6345\u20136353). IEEE Computer Society.","DOI":"10.1109\/CVPR.2017.672"},{"key":"2009_CR67","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., and Guo, B. (2021b). 
Swin transformer: Hierarchical vision transformer using shifted windows. In ICCV (pp. 9992\u201310002). IEEE.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2009_CR68","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., and Stoyanov, V. (2019c). Roberta: A robustly optimized BERT pretraining approach. CoRR, abs\/1907.11692."},{"key":"2009_CR69","doi-asserted-by":"crossref","unstructured":"Liu, S., Qian, S., Guan, Y., Zhan, J., & Ying, L. (2020). Joint-modal distribution-based similarity hashing for large-scale unsupervised deep cross-modal retrieval. In SIGIR (pp. 1379\u20131388). ACM.","DOI":"10.1145\/3397271.3401086"},{"issue":"3","key":"2009_CR70","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1561\/1500000016","volume":"3","author":"T Liu","year":"2009","unstructured":"Liu, T. (2009). Learning to rank for information retrieval. Foundations and Trends in Information Retrieval, 3(3), 225\u2013331.","journal-title":"Foundations and Trends in Information Retrieval"},{"issue":"9","key":"2009_CR71","doi-asserted-by":"publisher","first-page":"1217","DOI":"10.1007\/s11263-019-01174-4","volume":"127","author":"H Liu","year":"2019","unstructured":"Liu, H., Wang, R., Shan, S., & Chen, X. (2019). Deep supervised hashing for fast image retrieval. International Journal of Computer Vision, 127(9), 1217\u20131234.","journal-title":"International Journal of Computer Vision"},{"key":"2009_CR72","unstructured":"Liu, Z., Xiong, C., Lv, Y., Liu, Z., & Yu, G. (2023). Universal vision-language dense retrieval: Learning A unified representation space for multi-modal retrieval. In ICLR. OpenReview.net."},{"key":"2009_CR73","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.106851","volume":"219","author":"F Li","year":"2021","unstructured":"Li, F., Wang, T., Zhu, L., Zhang, Z., & Wang, X. (2021). Task-adaptive asymmetric deep cross-modal hashing. Knowl. Based Syst., 219, 106851.","journal-title":"Knowl. 
Based Syst."},{"key":"2009_CR74","doi-asserted-by":"publisher","first-page":"827","DOI":"10.1109\/LSP.2022.3157517","volume":"29","author":"T Li","year":"2022","unstructured":"Li, T., Zhang, Z., Pei, L., & Gan, Y. (2022). Hashformer: Vision transformer based deep hashing for image retrieval. IEEE Signal Processing Letters, 29, 827\u2013831.","journal-title":"IEEE Signal Processing Letters"},{"issue":"2","key":"2009_CR75","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe, D. G. (2004). Distinctive image features from scale-invariant keypoints. International Journal of Computer Vision, 60(2), 91\u2013110.","journal-title":"International Journal of Computer Vision"},{"key":"2009_CR76","unstructured":"Lu, J., Batra, D., Parikh, D., & Lee, S. (2019). Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In NeurIPS (pp. 13\u201323)."},{"key":"2009_CR77","unstructured":"Lu, D., Wang, J., Zeng, Z., Chen, B., Wu, S., & Xia, S. (2021). Swinfghash: Fine-grained image retrieval via transformer-based hashing network. In BMVC (p. 432). BMVA."},{"key":"2009_CR78","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., Ji, L., Zhong, M., Chen, Y., Lei, W., Duan, N., & Li, T. (2022). Clip4clip: An empirical study of CLIP for end to end video clip retrieval and captioning. Neurocomputing, 508, 293\u2013304.","journal-title":"Neurocomputing"},{"key":"2009_CR79","doi-asserted-by":"crossref","unstructured":"Martinez, J., Clement, J., Hoos, H.\u00a0H., & Little, J.\u00a0J. (2016). Revisiting additive quantization. In ECCV, volume 9906 (pp. 137\u2013153). 
Springer.","DOI":"10.1007\/978-3-319-46475-6_9"},{"key":"2009_CR80","doi-asserted-by":"crossref","unstructured":"Messina, N., Amato, G., Esuli, A., Falchi, F., Gennaro, C., & Marchand-Maillet, S. (2021). Fine-grained visual textual alignment for cross-modal retrieval using transformer encoders. ACM Trans. Multim. Comput. Commun. Appl., 17(4):128:1\u2013128:23.","DOI":"10.1145\/3451390"},{"key":"2009_CR81","doi-asserted-by":"crossref","unstructured":"Mikriukov, G., Ravanbakhsh, M., & Demir, B. (2022). Unsupervised contrastive hashing for cross-modal retrieval in remote sensing. In ICASSP (pp. 4463\u20134467). IEEE.","DOI":"10.1109\/ICASSP43922.2022.9746251"},{"key":"2009_CR82","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., Desmaison, A., K\u00f6pf, A., Yang, E.\u00a0Z., DeVito, Z., Raison, M., Tejani, A., Chilamkurthy, S., Steiner, B., Fang, L., Bai, J., & Chintala, S. (2019). Pytorch: An imperative style, high-performance deep learning library. In NeurIPS (pp. 8024\u20138035)."},{"key":"2009_CR83","unstructured":"Patrick, M., Huang, P., Asano, Y. M., Metze, F., Hauptmann, A. G., Henriques, J. F., & Vedaldi, A. (2021). Support-set bottlenecks for video-text representation learning. In ICLR. OpenReview.net."},{"key":"2009_CR84","doi-asserted-by":"publisher","first-page":"2989","DOI":"10.1109\/TIP.2020.3048680","volume":"30","author":"M Qi","year":"2021","unstructured":"Qi, M., Qin, J., Yang, Y., Wang, Y., & Luo, J. (2021). Semantics-aware spatial-temporal binaries for cross-modal video retrieval. IEEE Transactions on Image Processing, 30, 2989\u20133004.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2009_CR85","doi-asserted-by":"crossref","unstructured":"Radenovic, F., Dubey, A., Kadian, A., Mihaylov, T., Vandenhende, S., Patel, Y., Wen, Y., Ramanathan, V., & Mahajan, D. (2023). Filtering, distillation, and hard negatives for vision-language pre-training. 
In CVPR (pp. 6967\u20136977). IEEE.","DOI":"10.1109\/CVPR52729.2023.00673"},{"key":"2009_CR86","unstructured":"Radford, A., Kim, J.\u00a0W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. In ICML, volume 139 (pp. 8748\u20138763). PMLR."},{"key":"2009_CR87","doi-asserted-by":"crossref","unstructured":"Rasiwasia, N., Pereira, J.\u00a0C., Coviello, E., Doyle, G., Lanckriet, G. R.\u00a0G., Levy, R., & Vasconcelos, N. (2010). A new approach to cross-modal multimedia retrieval. In ACM Multimedia (pp. 251\u2013260). ACM.","DOI":"10.1145\/1873951.1873987"},{"key":"2009_CR88","unstructured":"Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). Distilbert, a distilled version of BERT: smaller, faster, cheaper and lighter. CoRR, abs\/1910.01108."},{"key":"2009_CR89","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.\u00a0R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., & Batra, D. (2017). Grad-cam: Visual explanations from deep networks via gradient-based localization. In ICCV (pp. 618\u2013626). IEEE Computer Society.","DOI":"10.1109\/ICCV.2017.74"},{"issue":"11\u201312","key":"2009_CR90","doi-asserted-by":"publisher","first-page":"1614","DOI":"10.1007\/s11263-019-01166-4","volume":"127","author":"Y Shen","year":"2019","unstructured":"Shen, Y., Liu, L., & Shao, L. (2019). Unsupervised binary representation learning with deep variational networks. International Journal of Computer Vision, 127(11\u201312), 1614\u20131628.","journal-title":"International Journal of Computer Vision"},{"issue":"10","key":"2009_CR91","doi-asserted-by":"publisher","first-page":"3351","DOI":"10.1109\/TKDE.2020.2970050","volume":"33","author":"HT Shen","year":"2021","unstructured":"Shen, H. T., Liu, L., Yang, Y., Xu, X., Huang, Z., Shen, F., & Hong, R. (2021). Exploiting subspace relation in semantic labels for cross-modal hashing. 
IEEE Transactions on Knowledge and Data Engineering, 33(10), 3351\u20133365.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"2009_CR92","unstructured":"Shi, Y., Chung, Y. (2021). Efficient cross-modal retrieval via deep binary hashing and quantization. In BMVC (p. 409). BMVA."},{"issue":"2","key":"2009_CR93","doi-asserted-by":"publisher","first-page":"435","DOI":"10.1007\/s11263-021-01547-8","volume":"130","author":"A Shin","year":"2022","unstructured":"Shin, A., Ishii, M., & Narihira, T. (2022). Perspectives and prospects on transformer architecture for cross-modal tasks with language and vision. International Journal of Computer Vision, 130(2), 435\u2013454.","journal-title":"International Journal of Computer Vision"},{"key":"2009_CR94","unstructured":"Simonyan, K., & Zisserman, A. (2015). Very deep convolutional networks for large-scale image recognition. In ICLR."},{"key":"2009_CR95","doi-asserted-by":"crossref","unstructured":"Song, Y., & Soleymani, M. (2019). Polysemous visual-semantic embedding for cross-modal retrieval. In CVPR (pp. 1979\u20131988). Computer Vision Foundation\/IEEE.","DOI":"10.1109\/CVPR.2019.00208"},{"key":"2009_CR96","doi-asserted-by":"crossref","unstructured":"Song, J., Yang, Y., Yang, Y., Huang, Z., & Shen, H.\u00a0T. (2013). Inter-media hashing for large-scale retrieval from heterogeneous data sources. In SIGMOD (pp. 785\u2013796). ACM.","DOI":"10.1145\/2463676.2465274"},{"issue":"8","key":"2009_CR97","doi-asserted-by":"publisher","first-page":"2243","DOI":"10.1007\/s11263-020-01305-2","volume":"128","author":"J Song","year":"2020","unstructured":"Song, J., He, T., Gao, L., Xu, X., Hanjalic, A., & Shen, H. T. (2020). Unified binary generative adversarial network for image retrieval and compression. 
International Journal of Computer Vision, 128(8), 2243\u20132264.","journal-title":"International Journal of Computer Vision"},{"key":"2009_CR98","doi-asserted-by":"crossref","unstructured":"Su, S., Zhong, Z., & Zhang, C. (2019). Deep joint-semantics reconstructing hashing for large-scale unsupervised cross-modal retrieval. In ICCV (pp. 3027\u20133035). IEEE.","DOI":"10.1109\/ICCV.2019.00312"},{"key":"2009_CR99","doi-asserted-by":"crossref","unstructured":"Sun, C., Latapie, H., Liu, G., & Yan, Y. (2022). Deep normalized cross-modal hashing with bi-direction relation reasoning. In CVPRW (pp. 4937\u20134945). IEEE.","DOI":"10.1109\/CVPRW56347.2022.00541"},{"key":"2009_CR100","doi-asserted-by":"crossref","unstructured":"Sun, C., Song, X., Feng, F., Zhao, W.\u00a0X., Zhang, H., & Nie, L. (2019). Supervised hierarchical cross-modal hashing. In SIGIR (pp. 725\u2013734). ACM.","DOI":"10.1145\/3331184.3331229"},{"key":"2009_CR101","doi-asserted-by":"crossref","unstructured":"Tan, W., Zhu, L., Guan, W., Li, J., & Cheng, Z. (2022). Bit-aware semantic transformer hashing for multi-modal retrieval. In SIGIR (pp. 982\u2013991). ACM.","DOI":"10.1145\/3477495.3531947"},{"key":"2009_CR102","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., & J\u00e9gou, H. (2021). Training data-efficient image transformers & distillation through attention. In ICML, volume 139 (pp. 10347\u201310357). PMLR."},{"key":"2009_CR103","doi-asserted-by":"crossref","unstructured":"Tu, J., Liu, X., Lin, Z., Hong, R., & Wang, M. (2022). Differentiable cross-modal hashing via multimodal transformers. In ACM Multimedia (pp. 453\u2013461). ACM.","DOI":"10.1145\/3503161.3548187"},{"key":"2009_CR104","doi-asserted-by":"publisher","first-page":"8946","DOI":"10.1109\/TMM.2023.3243608","volume":"25","author":"R Tu","year":"2023","unstructured":"Tu, R., Mao, X., Lin, Q., Ji, W., Qin, W., Wei, W., & Huang, H. (2023). Unsupervised cross-modal hashing via semantic text mining. 
IEEE Transactions on Multimedia, 25, 8946\u20138957.","journal-title":"IEEE Transactions on Multimedia"},{"key":"2009_CR105","unstructured":"van den Oord, A., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. CoRR, abs\/1807.03748."},{"key":"2009_CR106","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.\u00a0N., Kaiser, L., & Polosukhin, I. (2017). Attention is all you need. In NeurIPS (pp. 5998\u20136008)."},{"key":"2009_CR107","doi-asserted-by":"crossref","unstructured":"Wang, W., Shen, Y., Zhang, H., Yao, Y., & Liu, L. (2020b). Set and rebase: Determining the semantic graph connectivity for unsupervised cross-modal hashing. In IJCAI (pp. 853\u2013859). ijcai.org.","DOI":"10.24963\/ijcai.2020\/119"},{"key":"2009_CR108","doi-asserted-by":"crossref","unstructured":"Wang, Y., Wang, J., Chen, B., Zeng, Z., & Xia, S. (2023). Contrastive masked autoencoders for self-supervised video hashing. In AAAI (pp. 2733\u20132741). AAAI.","DOI":"10.1609\/aaai.v37i3.25373"},{"key":"2009_CR109","doi-asserted-by":"crossref","unstructured":"Wang, J., Zeng, Z., Chen, B., Dai, T., & Xia, S. (2022a). Contrastive quantization with code memory for unsupervised image retrieval. In AAAI, (pp. 2468\u20132476). AAAI.","DOI":"10.1609\/aaai.v36i3.20147"},{"key":"2009_CR110","unstructured":"Wang, J., Zeng, Z., Chen, B., Wang, Y., Liao, D., Li, G., Wang, Y., & Xia, S. (2022b). Hugs are better than handshakes: Unsupervised cross-modal transformer hashing with multi-granularity alignment. In BMVC (p. 1035). BMVA."},{"key":"2009_CR111","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhu, L., and Yang, Y. (2021b). T2VLAD: global-local sequence alignment for text-video retrieval. In CVPR (pp. 5079\u20135088). 
Computer Vision Foundation\/IEEE.","DOI":"10.1109\/CVPR46437.2021.00504"},{"issue":"1","key":"2009_CR112","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1109\/JPROC.2015.2487976","volume":"104","author":"J Wang","year":"2016","unstructured":"Wang, J., Liu, W., Kumar, S., & Chang, S. (2016). Learning to hash for indexing big data - A survey. Proceedings of the IEEE, 104(1), 34\u201357.","journal-title":"Proceedings of the IEEE"},{"key":"2009_CR113","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107732","volume":"111","author":"L Wang","year":"2021","unstructured":"Wang, L., Yang, J., Zareapoor, M., & Zheng, Z. (2021). Cluster-wise unsupervised hashing for cross-modal similarity search. Pattern Recognition, 111, 107732.","journal-title":"Pattern Recognition"},{"key":"2009_CR114","doi-asserted-by":"publisher","first-page":"1274","DOI":"10.1109\/TMM.2020.2995267","volume":"23","author":"Z Wang","year":"2021","unstructured":"Wang, Z., Zhang, Z., Luo, Y., Huang, Z., & Shen, H. T. (2021). Deep collaborative discrete hashing with semantic-invariant structure construction. IEEE Transactions on Multimedia, 23, 1274\u20131286.","journal-title":"IEEE Transactions on Multimedia"},{"issue":"4","key":"2009_CR115","doi-asserted-by":"publisher","first-page":"769","DOI":"10.1109\/TPAMI.2017.2699960","volume":"40","author":"J Wang","year":"2018","unstructured":"Wang, J., Zhang, T., Song, J., Sebe, N., & Shen, H. T. (2018). A survey on learning to hash. IEEE Transactions on Pattern Analysis and Machine Intelligence, 40(4), 769\u2013790.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2009_CR116","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1016\/j.neucom.2019.12.058","volume":"386","author":"T Wang","year":"2020","unstructured":"Wang, T., Zhu, L., Cheng, Z., Li, J., & Gao, Z. (2020). Unsupervised deep cross-modal hashing with virtual label regression. 
Neurocomputing, 386, 84\u201396.","journal-title":"Neurocomputing"},{"key":"2009_CR117","unstructured":"Weiss, Y., Torralba, A., & Fergus, R. (2008). Spectral hashing. In NeurIPS (pp. 1753\u20131760). Curran Associates, Inc."},{"key":"2009_CR118","doi-asserted-by":"crossref","unstructured":"Wu, G., Lin, Z., Han, J., Liu, L., Ding, G., Zhang, B., & Shen, J. (2018). Unsupervised deep hashing via binary latent factor models for large-scale cross-modal retrieval. In IJCAI (pp. 2854\u20132860). ijcai.org.","DOI":"10.24963\/ijcai.2018\/396"},{"key":"2009_CR119","doi-asserted-by":"crossref","unstructured":"Wu, W., Luo, H., Fang, B., Wang, J., & Ouyang, W. (2023). Cap4video: What can auxiliary captions do for text-video retrieval? In CVPR (pp. 10704\u201310713). IEEE.","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"2009_CR120","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., & Rui, Y. (2016). MSR-VTT: A large video description dataset for bridging video and language. In CVPR (pp. 5288\u20135296). IEEE Computer Society.","DOI":"10.1109\/CVPR.2016.571"},{"key":"2009_CR121","doi-asserted-by":"crossref","unstructured":"Yang, J., Bisk, Y., Gao, J. (2021). Taco: Token-aware cascade contrastive learning for video-text alignment. In ICCV (pp. 11542\u201311552). IEEE.","DOI":"10.1109\/ICCV48922.2021.01136"},{"key":"2009_CR122","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J.\u00a0G., Salakhutdinov, R., & Le, Q.\u00a0V. (2019). Xlnet: Generalized autoregressive pretraining for language understanding. In NeurIPS (pp. 5754\u20135764)."},{"key":"2009_CR123","doi-asserted-by":"crossref","unstructured":"Yang, D., Wu, D., Zhang, W., Zhang, H., Li, B., & Wang, W. (2020). Deep semantic-alignment hashing for unsupervised cross-modal retrieval. In ICMR (pp. 44\u201352).","DOI":"10.1145\/3372278.3390673"},{"key":"2009_CR124","unstructured":"Yao, L., Huang, R., Hou, L., Lu, G., Niu, M., Xu, H., Liang, X., Li, Z., Jiang, X., & Xu, C. (2022). 
FILIP: Fine-grained interactive language-image pre-training. In ICLR. OpenReview.net."},{"issue":"6","key":"2009_CR125","doi-asserted-by":"publisher","first-page":"2872","DOI":"10.1109\/TPAMI.2021.3054775","volume":"44","author":"M Ye","year":"2022","unstructured":"Ye, M., Shen, J., Lin, G., Xiang, T., Shao, L., & Hoi, S. C. H. (2022). Deep learning for person re-identification: A survey and outlook. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44(6), 2872\u20132893.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2009_CR126","doi-asserted-by":"crossref","unstructured":"Yu, H., Ding, S., Li, L., & Wu, J. (2022). Self-attentive CLIP hashing for unsupervised cross-modal retrieval. In ACM Multimedia (pp. 8:1\u20138:7). ACM.","DOI":"10.1145\/3551626.3564945"},{"key":"2009_CR127","doi-asserted-by":"crossref","unstructured":"Yu, Y., Kim, J., & Kim, G. (2018). A joint sequence fusion model for video question answering and retrieval. In ECCV, volume 11211 (pp. 487\u2013503). Springer.","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"2009_CR128","doi-asserted-by":"crossref","unstructured":"Yu, T., Yang, Y., Li, Y., Liu, L., Fei, H., & Li, P. (2021b). Heterogeneous attention network for effective and efficient cross-modal retrieval. In SIGIR (pp. 1146\u20131156). ACM.","DOI":"10.1145\/3404835.3462924"},{"key":"2009_CR129","doi-asserted-by":"crossref","unstructured":"Yu, J., Zhou, H., Zhan, Y., & Tao, D. (2021a). Deep graph-neighbor coherence preserving network for unsupervised cross-modal hashing. In AAAI (pp. 4626\u20134634). AAAI.","DOI":"10.1609\/aaai.v35i5.16592"},{"issue":"8","key":"2009_CR130","doi-asserted-by":"publisher","first-page":"2325","DOI":"10.1007\/s11263-020-01326-x","volume":"128","author":"T Yu","year":"2020","unstructured":"Yu, T., Meng, J., Fang, C., Jin, H., & Yuan, J. (2020). Product quantization network for fast visual search. 
International Journal of Computer Vision, 128(8), 2325\u20132343.","journal-title":"International Journal of Computer Vision"},{"key":"2009_CR131","doi-asserted-by":"crossref","unstructured":"Zala, A., Cho, J., Kottur, S., Chen, X., Oguz, B., Mehdad, Y., & Bansal, M. (2023). Hierarchical video-moment retrieval and step-captioning. In CVPR (pp. 23056\u201323065). IEEE.","DOI":"10.1109\/CVPR52729.2023.02208"},{"key":"2009_CR132","unstructured":"Zeng, Z., Wang, J., Chen, B., Wang, Y., & Xia, S. (2022). Motion-aware graph reasoning hashing for self-supervised video retrieval. In BMVC (p.\u00a082). BMVA."},{"key":"2009_CR133","unstructured":"Zhang, T., Du, C., & Wang, J. (2014). Composite quantization for approximate nearest neighbor search. In ICML, volume\u00a032 (pp. 838\u2013846). JMLR.org."},{"key":"2009_CR134","doi-asserted-by":"crossref","unstructured":"Zhang, J., Peng, Y., & Yuan, M. (2018). Unsupervised generative adversarial cross-modal hashing. In AAAI (pp. 539\u2013546). AAAI.","DOI":"10.1609\/aaai.v32i1.11263"},{"issue":"10","key":"2009_CR135","doi-asserted-by":"publisher","first-page":"4803","DOI":"10.1109\/TIP.2019.2912290","volume":"28","author":"Z Zhang","year":"2019","unstructured":"Zhang, Z., Lai, Z., Huang, Z., Wong, W. K., Xie, G., Liu, L., & Shao, L. (2019). Scalable supervised asymmetric hashing with semantic and latent factor embedding. IEEE Transactions on Image Processing, 28(10), 4803\u20134818.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2009_CR136","doi-asserted-by":"publisher","first-page":"466","DOI":"10.1109\/TMM.2021.3053766","volume":"24","author":"P Zhang","year":"2022","unstructured":"Zhang, P., Li, Y., Huang, Z., & Xu, X. (2022). Aggregation-based graph convolutional hashing for unsupervised cross-modal retrieval. 
IEEE Transactions on Multimedia, 24, 466\u2013479.","journal-title":"IEEE Transactions on Multimedia"},{"issue":"2","key":"2009_CR137","doi-asserted-by":"publisher","first-page":"563","DOI":"10.1007\/s11280-020-00859-y","volume":"24","author":"P Zhang","year":"2021","unstructured":"Zhang, P., Luo, Y., Huang, Z., Xu, X., & Song, J. (2021). High-order nonlocal hashing for unsupervised cross-modal retrieval. World Wide Web, 24(2), 563\u2013583.","journal-title":"World Wide Web"},{"issue":"5","key":"2009_CR138","first-page":"5091","volume":"35","author":"Z Zhang","year":"2023","unstructured":"Zhang, Z., Luo, H., Zhu, L., Lu, G., & Shen, H. T. (2023). Modality-invariant asymmetric networks for cross-modal hashing. IEEE Transactions on Knowledge and Data Engineering, 35(5), 5091\u20135104.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"2009_CR139","doi-asserted-by":"crossref","unstructured":"Zhang, J., & Peng, Y. (2020). Multi-pathway generative adversarial hashing for unsupervised cross-modal retrieval. IEEE Transactions on Multimedia, 22(1), 174\u2013187.","DOI":"10.1109\/TMM.2019.2922128"},{"key":"2009_CR140","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109462","volume":"139","author":"Z Zhang","year":"2023","unstructured":"Zhang, Z., Wang, J., Zhu, L., Luo, Y., & Lu, G. (2023). Deep collaborative graph hashing for discriminative image retrieval. Pattern Recognition, 139, 109462.","journal-title":"Pattern Recognition"},{"issue":"11","key":"2009_CR141","doi-asserted-by":"publisher","first-page":"2171","DOI":"10.1109\/TKDE.2019.2913388","volume":"32","author":"C Zheng","year":"2020","unstructured":"Zheng, C., Zhu, L., Lu, X., Li, J., Cheng, Z., & Zhang, H. (2020). Fast discrete collaborative multi-modal hashing for large-scale multimedia retrieval. 
IEEE Transactions on Knowledge and Data Engineering, 32(11), 2171\u20132184.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"2009_CR142","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Arandjelovic, R., & Zisserman, A. (2018). Ghostvlad for set-based face recognition. In ACCV, volume 11362 (pp. 35\u201350). Springer.","DOI":"10.1007\/978-3-030-20890-5_3"},{"key":"2009_CR143","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Zheng, L., Cao, D., & Li, S. (2017). Re-ranking person re-identification with k-reciprocal encoding. In CVPR (pp. 3652\u20133661). IEEE Computer Society.","DOI":"10.1109\/CVPR.2017.389"},{"key":"2009_CR144","doi-asserted-by":"crossref","unstructured":"Zhou, J., Ding, G., & Guo, Y. (2014). Latent semantic sparse hashing for cross-modal similarity search. In SIGIR (pp. 415\u2013424). ACM.","DOI":"10.1145\/2600428.2609610"},{"key":"2009_CR145","doi-asserted-by":"crossref","unstructured":"Zhu, X., Huang, Z., Shen, H.\u00a0T., & Zhao, X. (2013). Linear cross-modal hashing for efficient multimedia search. In ACM Multimedia (pp. 143\u2013152). ACM.","DOI":"10.1145\/2502081.2502107"},{"key":"2009_CR146","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., Zemel, R.\u00a0S., Salakhutdinov, R., Urtasun, R., Torralba, A., & Fidler, S. (2015). Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In ICCV (pp. 19\u201327). IEEE Computer Society.","DOI":"10.1109\/ICCV.2015.11"},{"key":"2009_CR147","doi-asserted-by":"crossref","unstructured":"Zhu, H., Long, M., Wang, J., & Cao, Y. (2016). Deep hashing network for efficient similarity retrieval. In AAAI (pp. 2415\u20132421). AAAI.","DOI":"10.1609\/aaai.v30i1.10235"},{"key":"2009_CR148","doi-asserted-by":"crossref","unstructured":"Zhuo, Y., Li, Y., Hsiao, J., Ho, C., & Li, B. (2022). Clip4hashing: Unsupervised deep hashing for cross-modal video-text retrieval. In ICMR, (pp. 158\u2013166). 
ACM.","DOI":"10.1145\/3512527.3531381"},{"issue":"9","key":"2009_CR149","doi-asserted-by":"publisher","first-page":"8838","DOI":"10.1109\/TKDE.2022.3218656","volume":"35","author":"L Zhu","year":"2023","unstructured":"Zhu, L., Wu, X., Li, J., Zhang, Z., Guan, W., & Shen, H. T. (2023). Work together: Correlation-identity reconstruction hashing for unsupervised cross-modal retrieval. IEEE Transactions on Knowledge and Data Engineering, 35(9), 8838\u20138851.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02009-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02009-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02009-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,11]],"date-time":"2024-07-11T14:11:53Z","timestamp":1720707113000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02009-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,18]]},"references-count":149,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2024,8]]}},"alternative-id":["2009"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02009-7","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,2,18]]},"assertion":[{"value":"13 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 January 
2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}