{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T06:13:21Z","timestamp":1769926401318,"version":"3.49.0"},"publisher-location":"Cham","reference-count":119,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726323","type":"print"},{"value":"9783031726330","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72633-0_2","type":"book-chapter","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T07:57:34Z","timestamp":1732175854000},"page":"21-41","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["IRGen: Generative Modeling for\u00a0Image Retrieval"],"prefix":"10.1007","author":[{"given":"Yidan","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Ting","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Dong","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yujing","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Xing","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Weiwei","family":"Deng","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Fan","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Mao","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Qingmin","family":"Liao","sequence":"additional","affiliation":[]},{"given":"Jingdong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Baining","family":"Guo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"2_CR1","unstructured":"Adiwardana, D., et\u00a0al.: Towards a human-like open-domain chatbot. arXiv preprint arXiv:2001.09977 (2020)"},{"key":"2_CR2","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. arXiv preprint arXiv:2204.14198 (2022)"},{"key":"2_CR3","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1016\/j.jvcir.2015.07.012","volume":"32","author":"A Alzu\u2019bi","year":"2015","unstructured":"Alzu\u2019bi, A., Amira, A., Ramzan, N.: Semantic content-based image retrieval: a comprehensive study. J. Vis. Commun. Image Represent. 32, 20\u201354 (2015)","journal-title":"J. Vis. Commun. Image Represent."},{"issue":"1","key":"2_CR4","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1145\/1327452.1327494","volume":"51","author":"A Andoni","year":"2008","unstructured":"Andoni, A., Indyk, P.: Near-optimal hashing algorithms for approximate nearest neighbor in high dimensions. Commun. ACM 51(1), 117\u2013122 (2008)","journal-title":"Commun. ACM"},{"key":"2_CR5","doi-asserted-by":"crossref","unstructured":"Babenko, A., Lempitsky, V.: Additive quantization for extreme vector compression. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 931\u2013938 (2014)","DOI":"10.1109\/CVPR.2014.124"},{"issue":"6","key":"2_CR6","doi-asserted-by":"publisher","first-page":"1247","DOI":"10.1109\/TPAMI.2014.2361319","volume":"37","author":"A Babenko","year":"2014","unstructured":"Babenko, A., Lempitsky, V.: The inverted multi-index. IEEE Trans. Pattern Anal. Mach. Intell. 37(6), 1247\u20131260 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2_CR7","doi-asserted-by":"crossref","unstructured":"Bai, S., Tang, P., Torr, P.H., Latecki, L.J.: Re-ranking via metric fusion for object retrieval and person re-identification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 740\u2013749 (2019)","DOI":"10.1109\/CVPR.2019.00083"},{"key":"2_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"404","DOI":"10.1007\/11744023_32","volume-title":"Computer Vision \u2013 ECCV 2006","author":"H Bay","year":"2006","unstructured":"Bay, H., Tuytelaars, T., Van Gool, L.: SURF: speeded up robust features. In: Leonardis, A., Bischof, H., Pinz, A. (eds.) ECCV 2006. LNCS, vol. 3951, pp. 404\u2013417. Springer, Heidelberg (2006). https:\/\/doi.org\/10.1007\/11744023_32"},{"key":"2_CR9","unstructured":"Bengio, Y., L\u00e9onard, N., Courville, A.: Estimating or propagating gradients through stochastic neurons for conditional computation. arXiv preprint arXiv:1308.3432 (2013)"},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Bentley, J.L.: K-d trees for semidynamic point sets. In: Proceedings of the Sixth Annual Symposium on Computational Geometry, pp. 187\u2013197 (1990)","DOI":"10.1145\/98524.98564"},{"key":"2_CR11","unstructured":"Bevilacqua, M., Ottaviano, G., Lewis, P., Yih, W.T., Riedel, S., Petroni, F.: Autoregressive search engines: generating substrings as document identifiers. arXiv preprint arXiv:2204.10628 (2022)"},{"key":"2_CR12","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"726","DOI":"10.1007\/978-3-030-58565-5_43","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Cao","year":"2020","unstructured":"Cao, B., Araujo, A., Sim, J.: Unifying deep local and global features for image search. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part XX. LNCS, vol. 12365, pp. 726\u2013743. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_43"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Cao, Y., Wang, C., Zhang, L., Zhang, L.: Edgel index for large-scale sketch-based image search. In: CVPR 2011, pp. 761\u2013768. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995460"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Cao, Y., Long, M., Wang, J., Liu, S.: Collective deep quantization for efficient cross-modal retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a031 (2017)","DOI":"10.1609\/aaai.v31i1.11218"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Cao, Y., Long, M., Wang, J., Zhu, H., Wen, Q.: Deep quantization network for efficient image retrieval. In: Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence, AAAI 2016, pp. 3457\u20133463. AAAI Press (2016)","DOI":"10.1609\/aaai.v30i1.10455"},{"key":"2_CR17","unstructured":"Chen, M., Radford, A., Child, R., Wu, J., Jun, H., Luan, D., Sutskever, I.: Generative pretraining from pixels. In: International Conference on Machine Learning, pp. 1691\u20131703. PMLR (2020)"},{"key":"2_CR18","first-page":"5199","volume":"34","author":"Q Chen","year":"2021","unstructured":"Chen, Q., et al.: SPANN: highly-efficient billion-scale approximate nearest neighborhood search. Adv. Neural. Inf. Process. Syst. 34, 5199\u20135212 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR19","unstructured":"Chen, T., Li, L., Sun, Y.: Differentiable product quantization for end-to-end embedding compression. In: International Conference on Machine Learning. pp. 1617\u20131626. PMLR (2020)"},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"Cho, J., Lu, J., Schwenk, D., Hajishirzi, H., Kembhavi, A.: X-LXMERT: paint, caption and answer questions with multi-modal transformers. arXiv preprint arXiv:2009.11278 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.707"},{"key":"2_CR21","unstructured":"Chowdhery, A., et\u00a0al.: PALM: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311 (2022)"},{"key":"2_CR22","unstructured":"Chum, O., Philbin, J., Zisserman, A., et al.: Near duplicate image detection: Min-hash and TF-IDF weighting. In: BMVC, vol. 810, pp. 812\u2013815 (2008)"},{"key":"2_CR23","doi-asserted-by":"crossref","unstructured":"Chung, Y.A., Hsu, W.N., Tang, H., Glass, J.: An unsupervised autoregressive model for speech representation learning. arXiv preprint arXiv:1904.03240 (2019)","DOI":"10.21437\/Interspeech.2019-1473"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"De\u00a0Cao, N., Aziz, W., Titov, I.: Highly parallel autoregressive entity linking with discriminative correction. arXiv preprint arXiv:2109.03792 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.604"},{"key":"2_CR25","unstructured":"De\u00a0Cao, N., Izacard, G., Riedel, S., Petroni, F.: Autoregressive entity retrieval. arXiv preprint arXiv:2010.00904 (2020)"},{"key":"2_CR26","doi-asserted-by":"publisher","first-page":"274","DOI":"10.1162\/tacl_a_00460","volume":"10","author":"N De Cao","year":"2022","unstructured":"De Cao, N., et al.: Multilingual autoregressive entity linking. Trans. Assoc. Comput. Linguist. 10, 274\u2013290 (2022)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"2_CR27","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2_CR28","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"2_CR29","unstructured":"Dhariwal, P., Jun, H., Payne, C., Kim, J.W., Radford, A., Sutskever, I.: Jukebox: a generative model for music. arXiv preprint arXiv:2005.00341 (2020)"},{"key":"2_CR30","unstructured":"Ding, M., Zheng, W., Hong, W., Tang, J.: CogView2: faster and better text-to-image generation via hierarchical transformers. arXiv preprint arXiv:2204.14217 (2022)"},{"key":"2_CR31","doi-asserted-by":"crossref","unstructured":"Dizaji, K.G., Zheng, F., Sadoughi, N., Yang, Y., Deng, C., Huang, H.: Unsupervised deep generative adversarial hashing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3664\u20133673 (2018)","DOI":"10.1109\/CVPR.2018.00386"},{"key":"2_CR32","unstructured":"Du, N., et\u00a0al.: GLaM: efficient scaling of language models with mixture-of-experts. In: International Conference on Machine Learning, pp. 5547\u20135569. PMLR (2022)"},{"key":"2_CR33","unstructured":"El-Nouby, A., Neverova, N., Laptev, I., J\u00e9gou, H.: Training vision transformers for image retrieval. arXiv preprint arXiv:2102.05644 (2021)"},{"key":"2_CR34","doi-asserted-by":"crossref","unstructured":"Erin\u00a0Liong, V., Lu, J., Wang, G., Moulin, P., Zhou, J.: Deep hashing for compact binary codes learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2475\u20132483 (2015)","DOI":"10.1109\/CVPR.2015.7298862"},{"key":"2_CR35","doi-asserted-by":"crossref","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., Taigman, Y.: Make-a-scene: scene-based text-to-image generation with human priors. arXiv preprint arXiv:2203.13131 (2022)","DOI":"10.1007\/978-3-031-19784-0_6"},{"issue":"4","key":"2_CR36","doi-asserted-by":"publisher","first-page":"744","DOI":"10.1109\/TPAMI.2013.240","volume":"36","author":"T Ge","year":"2013","unstructured":"Ge, T., He, K., Ke, Q., Sun, J.: Optimized product quantization. IEEE Trans. Pattern Anal. Mach. Intell. 36(4), 744\u2013755 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"12","key":"2_CR37","doi-asserted-by":"publisher","first-page":"2916","DOI":"10.1109\/TPAMI.2012.193","volume":"35","author":"Y Gong","year":"2012","unstructured":"Gong, Y., Lazebnik, S., Gordo, A., Perronnin, F.: Iterative quantization: a procrustean approach to learning binary codes for large-scale image retrieval. IEEE Trans. Pattern Anal. Mach. Intell. 35(12), 2916\u20132929 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2_CR38","unstructured":"Gulrajani, I., et al.: PixelVAE: a latent variable model for natural images. arXiv preprint arXiv:1611.05013 (2016)"},{"key":"2_CR39","unstructured":"Guo, R., et al.: Accelerating large-scale inference with anisotropic vector quantization. In: International Conference on Machine Learning, pp. 3887\u20133896. PMLR (2020)"},{"key":"2_CR40","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2_CR41","doi-asserted-by":"crossref","unstructured":"Indyk, P., Motwani, R.: Approximate nearest neighbors: towards removing the curse of dimensionality. In: Proceedings of the Thirtieth Annual ACM Symposium on Theory of Computing, pp. 604\u2013613 (1998)","DOI":"10.1145\/276698.276876"},{"key":"2_CR42","doi-asserted-by":"crossref","unstructured":"Jang, Y.K., Cho, N.I.: Generalized product quantization network for semi-supervised image retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3420\u20133429 (2020)","DOI":"10.1109\/CVPR42600.2020.00348"},{"key":"2_CR43","doi-asserted-by":"crossref","unstructured":"Jang, Y.K., Cho, N.I.: Self-supervised product quantization for deep unsupervised image retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12085\u201312094 (2021)","DOI":"10.1109\/ICCV48922.2021.01187"},{"key":"2_CR44","unstructured":"Jayaram\u00a0Subramanya, S., Devvrit, F., Simhadri, H.V., Krishnawamy, R., Kadekodi, R.: DiskANN: fast accurate billion-point nearest neighbor search on a single node. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"issue":"1","key":"2_CR45","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1109\/TPAMI.2010.57","volume":"33","author":"H Jegou","year":"2010","unstructured":"Jegou, H., Douze, M., Schmid, C.: Product quantization for nearest neighbor search. IEEE Trans. Pattern Anal. Mach. Intell. 33(1), 117\u2013128 (2010)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2_CR46","doi-asserted-by":"crossref","unstructured":"J\u00e9gou, H., Douze, M., Schmid, C., P\u00e9rez, P.: Aggregating local descriptors into a compact image representation. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 3304\u20133311. IEEE (2010)","DOI":"10.1109\/CVPR.2010.5540039"},{"issue":"3","key":"2_CR47","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","volume":"7","author":"J Johnson","year":"2019","unstructured":"Johnson, J., Douze, M., J\u00e9gou, H.: Billion-scale similarity search with GPUs. IEEE Trans. Big Data 7(3), 535\u2013547 (2019)","journal-title":"IEEE Trans. Big Data"},{"key":"2_CR48","unstructured":"Jun, H., Ko, B., Kim, Y., Kim, I., Kim, J.: Combination of multiple global descriptors for image retrieval. arXiv preprint arXiv:1903.10663 (2019)"},{"key":"2_CR49","doi-asserted-by":"publisher","unstructured":"Klein, B., Wolf, L.: End-to-end supervised product quantization for image search and retrieval. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5036\u20135045 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00518","DOI":"10.1109\/CVPR.2019.00518"},{"key":"2_CR50","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3D object representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 554\u2013561 (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"issue":"6","key":"2_CR51","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. Commun. ACM 60(6), 84\u201390 (2017)","journal-title":"Commun. ACM"},{"key":"2_CR52","doi-asserted-by":"crossref","unstructured":"Lagunes-Fortiz, M., Damen, D., Mayol-Cuevas, W.: Centroids triplet network and temporally-consistent embeddings for in-situ object recognition. In: 2020 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 10796\u201310802. IEEE (2020)","DOI":"10.1109\/IROS45743.2020.9341050"},{"key":"2_CR53","doi-asserted-by":"crossref","unstructured":"Lee, D., Kim, C., Kim, S., Cho, M., Han, W.S.: Autoregressive image generation using residual quantization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11523\u201311532 (2022)","DOI":"10.1109\/CVPR52688.2022.01123"},{"key":"2_CR54","doi-asserted-by":"crossref","unstructured":"Lee, S., Seong, H., Lee, S., Kim, E.: Correlation verification for image retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5374\u20135384 (2022)","DOI":"10.1109\/CVPR52688.2022.00530"},{"key":"2_CR55","doi-asserted-by":"crossref","unstructured":"Lew, M.S., Sebe, N., Djeraba, C., Jain, R.: Content-based multimedia information retrieval: state of the art and challenges. ACM Trans. Multimed. Comput. Commun. Appl. (TOMM) 2(1), 1\u201319 (2006)","DOI":"10.1145\/1126004.1126005"},{"key":"2_CR56","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"issue":"1","key":"2_CR57","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2906152","volume":"49","author":"X Li","year":"2016","unstructured":"Li, X., Uricchio, T., Ballan, L., Bertini, M., Snoek, C.G., Bimbo, A.D.: Socializing the semantic gap: a comparative survey on image tag assignment, refinement, and retrieval. ACM Comput. Surv. (CSUR) 49(1), 1\u201339 (2016)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"2_CR58","doi-asserted-by":"crossref","unstructured":"Liu, B., Cao, Y., Long, M., Wang, J., Wang, J.: Deep triplet quantization. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 755\u2013763 (2018)","DOI":"10.1145\/3240508.3240516"},{"issue":"1","key":"2_CR59","doi-asserted-by":"publisher","first-page":"262","DOI":"10.1016\/j.patcog.2006.04.045","volume":"40","author":"Y Liu","year":"2007","unstructured":"Liu, Y., Zhang, D., Lu, G., Ma, W.Y.: A survey of content-based image retrieval with high-level semantics. Pattern Recogn. 40(1), 262\u2013282 (2007)","journal-title":"Pattern Recogn."},{"key":"2_CR60","doi-asserted-by":"crossref","unstructured":"Liu, Z., Cheng, K.T., Huang, D., Xing, E.P., Shen, Z.: Nonuniform-to-uniform quantization: towards accurate quantization via generalized straight-through estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4942\u20134952 (2022)","DOI":"10.1109\/CVPR52688.2022.00489"},{"key":"2_CR61","doi-asserted-by":"crossref","unstructured":"Liu, Z., Luo, P., Qiu, S., Wang, X., Tang, X.: DeepFashion: powering robust clothes recognition and retrieval with rich annotations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1096\u20131104 (2016)","DOI":"10.1109\/CVPR.2016.124"},{"key":"2_CR62","doi-asserted-by":"crossref","unstructured":"Lowe, D.G.: Object recognition from local scale-invariant features. In: Proceedings of the Seventh IEEE International Conference on Computer Vision, vol.\u00a02, pp. 1150\u20131157. IEEE (1999)","DOI":"10.1109\/ICCV.1999.790410"},{"issue":"4","key":"2_CR63","doi-asserted-by":"publisher","first-page":"824","DOI":"10.1109\/TPAMI.2018.2889473","volume":"42","author":"YA Malkov","year":"2018","unstructured":"Malkov, Y.A., Yashunin, D.A.: Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE Trans. Pattern Anal. Mach. Intell. 42(4), 824\u2013836 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2_CR64","unstructured":"Martinez, J., Hoos, H.H., Little, J.J.: Stacked quantizers for compositional vector compression. arXiv preprint arXiv:1411.2173 (2014)"},{"key":"2_CR65","doi-asserted-by":"crossref","unstructured":"Noh, H., Araujo, A., Sim, J., Weyand, T., Han, B.: Large-scale image retrieval with attentive deep local features. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3456\u20133465 (2017)","DOI":"10.1109\/ICCV.2017.374"},{"key":"2_CR66","doi-asserted-by":"crossref","unstructured":"Norouzi, M., Fleet, D.J.: Cartesian k-means. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3017\u20133024 (2013)","DOI":"10.1109\/CVPR.2013.388"},{"key":"2_CR67","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. arXiv preprint arXiv:2203.02155 (2022)"},{"key":"2_CR68","doi-asserted-by":"crossref","unstructured":"Park, M., Jin, J.S., Wilson, L.S.: Fast content-based image retrieval using quasi-gabor filter and reduction of image feature dimension. In: Proceedings fifth IEEE Southwest Symposium on Image Analysis and Interpretation, pp. 178\u2013182. IEEE (2002)","DOI":"10.1109\/IAI.2002.999914"},{"key":"2_CR69","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Mei, T.: Deep quantization: encoding convolutional activations with deep generative model. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6759\u20136768 (2017)","DOI":"10.1109\/CVPR.2017.435"},{"key":"2_CR70","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"2_CR71","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"issue":"8","key":"2_CR72","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"2_CR73","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"2_CR74","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"issue":"3","key":"2_CR75","first-page":"251","volume":"4","author":"AS Razavian","year":"2016","unstructured":"Razavian, A.S., Sullivan, J., Carlsson, S., Maki, A.: Visual instance retrieval with deep convolutional networks. ITE Trans. Media Technol. Appl. 4(3), 251\u2013258 (2016)","journal-title":"ITE Trans. Media Technol. Appl."},{"key":"2_CR76","first-page":"10672","volume":"33","author":"J Ren","year":"2020","unstructured":"Ren, J., Zhang, M., Li, D.: HM-ANN: efficient billion-point nearest neighbor search on heterogeneous memory. Adv. Neural. Inf. Process. Syst. 33, 10672\u201310684 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR77","doi-asserted-by":"crossref","unstructured":"Revaud, J., Almaz\u00e1n, J., Rezende, R.S., Souza, C.R.D.: Learning with average precision: Training image retrieval with a listwise loss. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5107\u20135116 (2019)","DOI":"10.1109\/ICCV.2019.00521"},{"key":"2_CR78","doi-asserted-by":"crossref","unstructured":"Sharif\u00a0Razavian, A., Azizpour, H., Sullivan, J., Carlsson, S.: CNN features off-the-shelf: an astounding baseline for recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 806\u2013813 (2014)","DOI":"10.1109\/CVPRW.2014.131"},{"key":"2_CR79","unstructured":"Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., Catanzaro, B.: Megatron-LM: training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)"},{"issue":"2","key":"2_CR80","doi-asserted-by":"publisher","first-page":"300","DOI":"10.1109\/TPAMI.2007.40","volume":"29","author":"C Siagian","year":"2007","unstructured":"Siagian, C., Itti, L.: Rapid biologically-inspired scene classification using features shared with visual attention. IEEE Trans. Pattern Anal. Mach. Intell. 29(2), 300\u2013312 (2007)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2_CR81","doi-asserted-by":"crossref","unstructured":"Sim\u00e9oni, O., Avrithis, Y., Chum, O.: Local features and visual words emerge in activations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11651\u201311660 (2019)","DOI":"10.1109\/CVPR.2019.01192"},{"key":"2_CR82","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"issue":"12","key":"2_CR83","doi-asserted-by":"publisher","first-page":"1349","DOI":"10.1109\/34.895972","volume":"22","author":"AW Smeulders","year":"2000","unstructured":"Smeulders, A.W., Worring, M., Santini, S., Gupta, A., Jain, R.: Content-based image retrieval at the end of the early years. IEEE Trans. Pattern Anal. Mach. Intell. 22(12), 1349\u20131380 (2000)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2_CR84","doi-asserted-by":"crossref","unstructured":"Song, J., He, T., Gao, L., Xu, X., Hanjalic, A., Shen, H.T.: Binary generative adversarial networks for image retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.11276"},{"key":"2_CR85","doi-asserted-by":"crossref","unstructured":"Tan, F., Yuan, J., Ordonez, V.: Instance-level image retrieval using reranking transformers. In: proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12105\u201312115 (2021)","DOI":"10.1109\/ICCV48922.2021.01189"},{"key":"2_CR86","unstructured":"Tay, Y., et\u00a0al.: Transformer memory as a differentiable search index. arXiv preprint arXiv:2202.06991 (2022)"},{"key":"2_CR87","doi-asserted-by":"crossref","unstructured":"Teichmann, M., Araujo, A., Zhu, M., Sim, J.: Detect-to-retrieve: efficient regional aggregation for image search. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5109\u20135118 (2019)","DOI":"10.1109\/CVPR.2019.00525"},{"key":"2_CR88","unstructured":"Van Den\u00a0Oord, A., Vinyals, O., et\u00a0al.: Neural discrete representation learning. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"2_CR89","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"2_CR90","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The caltech-UCSD birds-200-2011 dataset (2011)"},{"issue":"1","key":"2_CR91","doi-asserted-by":"publisher","first-page":"180","DOI":"10.1109\/TKDE.2014.2324592","volume":"27","author":"J Wang","year":"2014","unstructured":"Wang, J., Wang, J., Song, J., Xu, X.S., Shen, H.T., Li, S.: Optimized cartesian k-means. IEEE Trans. Knowl. Data Eng. 27(1), 180\u2013192 (2014)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"issue":"1","key":"2_CR92","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2036264.2036276","volume":"3","author":"J Wang","year":"2011","unstructured":"Wang, J., Hua, X.S.: Interactive image search by color map. ACM Trans. Intell. Syst. Technol. (TIST) 3(1), 1\u201323 (2011)","journal-title":"ACM Trans. Intell. Syst. Technol. (TIST)"},{"issue":"6","key":"2_CR93","doi-asserted-by":"publisher","first-page":"1308","DOI":"10.1109\/TPAMI.2018.2835468","volume":"41","author":"J Wang","year":"2018","unstructured":"Wang, J., Zhang, T.: Composite quantization. IEEE Trans. Pattern Anal. Mach. Intell. 41(6), 1308\u20131322 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2_CR94","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: BEIT pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)","DOI":"10.1109\/CVPR52729.2023.01838"},{"issue":"3","key":"2_CR95","doi-asserted-by":"publisher","first-page":"545","DOI":"10.1007\/s11042-012-1055-7","volume":"68","author":"XY Wang","year":"2014","unstructured":"Wang, X.Y., Zhang, B.B., Yang, H.Y.: Content-based image retrieval by integrating color and texture features. Multimed. Tools Appl. 68(3), 545\u2013569 (2014)","journal-title":"Multimed. Tools Appl."},{"key":"2_CR96","unstructured":"Wang, Y., et\u00a0al.: A neural corpus indexer for document retrieval. arXiv preprint arXiv:2206.02743 (2022)"},{"key":"2_CR97","unstructured":"Weissenborn, D., T\u00e4ckstr\u00f6m, O., Uszkoreit, J.: Scaling autoregressive video models. arXiv preprint arXiv:1906.02634 (2019)"},{"key":"2_CR98","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"499","DOI":"10.1007\/978-3-319-46478-7_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"Y Wen","year":"2016","unstructured":"Wen, Y., Zhang, K., Li, Z., Qiao, Yu.: A discriminative feature learning approach for deep face recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 499\u2013515. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_31"},{"key":"2_CR99","doi-asserted-by":"crossref","unstructured":"Wengert, C., Douze, M., J\u00e9gou, H.: Bag-of-colors for improved image search. In: Proceedings of the 19th ACM International Conference on Multimedia, pp. 1437\u20131440 (2011)","DOI":"10.1145\/2072298.2072034"},{"key":"2_CR100","series-title":"Communications in Computer and Information Science","doi-asserted-by":"publisher","first-page":"294","DOI":"10.1007\/978-3-030-63820-7_33","volume-title":"Neural Information Processing","author":"M Wieczorek","year":"2020","unstructured":"Wieczorek, M., Michalowski, A., Wroblewska, A., Dabrowski, J.: A strong baseline for fashion retrieval with person re-identification models. In: Yang, H., Pasupa, K., Leung, A.C.-S., Kwok, J.T., Chan, J.H., King, I. (eds.) ICONIP 2020. CCIS, vol. 1332, pp. 294\u2013301. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-63820-7_33"},{"key":"2_CR101","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"720","DOI":"10.1007\/978-3-031-19787-1_41","volume-title":"Computer Vision \u2013 ECCV 2022","author":"C Wu","year":"2022","unstructured":"Wu, C., et al.: N\u00dcWA: visual synthesis pre-training for neural visual world creation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13676, pp. 720\u2013736. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19787-1_41"},{"key":"2_CR102","doi-asserted-by":"crossref","unstructured":"Xia, Y., He, K., Wen, F., Sun, J.: Joint inverted indexing. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3416\u20133423 (2013)","DOI":"10.1109\/ICCV.2013.424"},{"key":"2_CR103","doi-asserted-by":"crossref","unstructured":"Xiao, T., Li, H., Ouyang, W., Wang, X.: Learning deep feature representations with domain guided dropout for person re-identification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1249\u20131258 (2016)","DOI":"10.1109\/CVPR.2016.140"},{"key":"2_CR104","unstructured":"Xu, Y., et\u00a0al.: GSPMD: general and scalable parallelization for ML computation graphs. arXiv preprint arXiv:2105.04663 (2021)"},{"key":"2_CR105","doi-asserted-by":"crossref","unstructured":"Yang, M., et al.: DOLG: single-stage image retrieval with deep orthogonal fusion of local and global features. In: Proceedings of the IEEE\/CVF International conference on Computer Vision, pp. 11772\u201311781 (2021)","DOI":"10.1109\/ICCV48922.2021.01156"},{"key":"2_CR106","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.R., Le, Q.V.: XLNet: generalized autoregressive pretraining for language understanding. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"2_CR107","unstructured":"Yu, J., et al.: Vector-quantized image modeling with improved vqgan. arXiv preprint arXiv:2110.04627 (2021)"},{"key":"2_CR108","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"2_CR109","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789 (2022)"},{"key":"2_CR110","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Chen, W., Yang, Y., Wang, Z.: In defense of the triplet loss again: learning robust person re-identification with fast approximated triplet loss and label distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 354\u2013355 (2020)","DOI":"10.1109\/CVPRW50498.2020.00185"},{"key":"2_CR111","unstructured":"Zhai, A., Wu, H.Y.: Classification is a strong baseline for deep metric learning. arXiv preprint arXiv:1811.12649 (2018)"},{"key":"2_CR112","doi-asserted-by":"crossref","unstructured":"Zhan, J., Mao, J., Liu, Y., Guo, J., Zhang, M., Ma, S.: Jointly optimizing query encoder and product quantization to improve retrieval performance. In: Proceedings of the 30th ACM International Conference on Information & Knowledge Management, pp. 2487\u20132496 (2021)","DOI":"10.1145\/3459637.3482358"},{"key":"2_CR113","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: Joint learning of deep retrieval model and product quantization based embedding index. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1718\u20131722 (2021)","DOI":"10.1145\/3404835.3462988"},{"key":"2_CR114","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rui, Y.: Image search-from thousands to billions in 20 years. ACM Trans. Multimed. Comput. Commun. Appl. (TOMM) 9(1s), 1\u201320 (2013)","DOI":"10.1145\/2490823"},{"key":"2_CR115","doi-asserted-by":"crossref","unstructured":"Zhou, B., Lapedriza, A., Khosla, A., Oliva, A., Torralba, A.: Places: a 10 million image database for scene recognition. IEEE Trans. Pattern Anal. Mach. Intell. (2017)","DOI":"10.1167\/17.10.296"},{"issue":"4","key":"2_CR116","doi-asserted-by":"publisher","first-page":"1638","DOI":"10.1109\/TNNLS.2020.3043103","volume":"33","author":"C Zhou","year":"2020","unstructured":"Zhou, C., Po, L.M., Ou, W.: Angular deep supervised vector quantization for image retrieval. IEEE Trans. Neural Netw. Learn. Syst. 33(4), 1638\u20131649 (2020)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"2_CR117","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, Y., Cavallaro, A., Xiang, T.: Omni-scale feature learning for person re-identification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3702\u20133712 (2019)","DOI":"10.1109\/ICCV.2019.00380"},{"key":"2_CR118","unstructured":"Zhou, W., Li, H., Tian, Q.: Recent advance in content-based image retrieval: a literature survey. arXiv preprint arXiv:1706.06064 (2017)"},{"key":"2_CR119","doi-asserted-by":"crossref","unstructured":"Zhu, H., Long, M., Wang, J., Cao, Y.: Deep hashing network for efficient similarity retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a030 (2016)","DOI":"10.1609\/aaai.v30i1.10235"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72633-0_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T08:05:47Z","timestamp":1732176347000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72633-0_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"ISBN":["9783031726323","9783031726330"],"references-count":119,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72633-0_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,22]]},"assertion":[{"value":"22 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}