{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T13:37:38Z","timestamp":1767706658950,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":50,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819755547"},{"type":"electronic","value":"9789819755554"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-5555-4_30","type":"book-chapter","created":{"date-parts":[[2025,1,11]],"date-time":"2025-01-11T05:39:25Z","timestamp":1736573965000},"page":"419-434","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Flickr30K-CFQ: A Compact and Fragmented Query Dataset for Text-image Retrieval"],"prefix":"10.1007","author":[{"given":"Haoyu","family":"Liu","sequence":"first","affiliation":[]},{"given":"Yaoxian","family":"Song","sequence":"additional","affiliation":[]},{"given":"Xuwu","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xiangru","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Zhixu","family":"Li","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Song","sequence":"additional","affiliation":[]},{"given":"Tiefeng","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,12]]},"reference":[{"key":"30_CR1","unstructured":"Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality | LMSYS Org. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna"},{"key":"30_CR2","doi-asserted-by":"publisher","unstructured":"Brown, T.B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., Agarwal, S., Herbert-Voss, A., Krueger, G., Henighan, T., Child, R., Ramesh, A., Ziegler, D.M., Wu, J., Winter, C., Hesse, C., Chen, M., Sigler, E., Litwin, M., Gray, S., Chess, B., Clark, J., Berner, C., McCandlish, S., Radford, A., Sutskever, I., Amodei, D.: Language Models are Few-Shot Learners (Jul 2020). https:\/\/doi.org\/10.48550\/arXiv.2005.14165","DOI":"10.48550\/arXiv.2005.14165"},{"key":"30_CR3","doi-asserted-by":"crossref","unstructured":"Chen, T., Deng, J., Luo, J.: Adaptive offline quintuplet loss for image-text matching. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIII 16. pp. 549\u2013565. Springer (2020)","DOI":"10.1007\/978-3-030-58601-0_33"},{"key":"30_CR4","doi-asserted-by":"crossref","unstructured":"Chen, X., Lu, Y., Wang, Y., Yang, J.: Cmbf: Cross-modal-based fusion recommendation algorithm. 
Sensors 21(16), \u00a05275 (2021)","DOI":"10.3390\/s21165275"},{"key":"30_CR5","doi-asserted-by":"publisher","unstructured":"Chen, X., Fang, H., Lin, T.Y., Vedantam, R., Gupta, S., Dollar, P., Zitnick, C.L.: Microsoft COCO Captions: Data Collection and Evaluation Server (Apr 2015https:\/\/doi.org\/10.48550\/arXiv.1504.00325","DOI":"10.48550\/arXiv.1504.00325"},{"key":"30_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, M., Jing, L., Ng, M.K.: Robust unsupervised cross-modal hashing for multimedia retrieval. ACM Transactions on Information Systems (TOIS) 38(3), 1\u201325 (2020)","DOI":"10.1145\/3389547"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Chua, T.S., Tang, J., Hong, R., Li, H., Luo, Z., Zheng, Y.: Nus-wide: a real-world web image database from national university of singapore. In: Proceedings of the ACM international conference on image and video retrieval. pp.\u00a01\u20139 (2009)","DOI":"10.1145\/1646396.1646452"},{"key":"30_CR8","doi-asserted-by":"publisher","unstructured":"Chua, T.S., Tang, J., Hong, R., Li, H., Luo, Z., Zheng, Y.: NUS-WIDE: A real-world web image database from National University of Singapore. In: Proceedings of the ACM International Conference on Image and Video Retrieval. pp.\u00a01\u20139. ACM, Santorini, Fira Greece (Jul 2009).https:\/\/doi.org\/10.1145\/1646396.1646452","DOI":"10.1145\/1646396.1646452"},{"key":"30_CR9","doi-asserted-by":"publisher","unstructured":"Craswell, N., Campos, D., Mitra, B., Yilmaz, E., Billerbeck, B.: ORCAS: 18 Million Clicked Query-Document Pairs for Analyzing Search (Aug 2020).https:\/\/doi.org\/10.48550\/arXiv.2006.05324","DOI":"10.48550\/arXiv.2006.05324"},{"key":"30_CR10","doi-asserted-by":"crossref","unstructured":"Diao, H., Zhang, Y., Ma, L., Lu, H.: Similarity reasoning and filtration for image-text matching. In: Proceedings of the AAAI conference on artificial intelligence. vol.\u00a035, pp. 1218\u20131226 (2021)","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"30_CR11","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: Vse++: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)"},{"key":"30_CR12","doi-asserted-by":"crossref","unstructured":"Fan, Z., Wei, Z., Li, Z., Wang, S., Fan, J.: Negative sample is negative in its own way: Tailoring negative sentences for image-text retrieval. arXiv preprint arXiv:2111.03349 (2021)","DOI":"10.18653\/v1\/2022.findings-naacl.204"},{"key":"30_CR13","doi-asserted-by":"crossref","unstructured":"Feng, D., He, X., Peng, Y.: Mkvse: Multimodal knowledge enhanced visual-semantic embedding for image-text retrieval. ACM Transactions on Multimedia Computing, Communications and Applications 19(5), 1\u201321 (2023)","DOI":"10.1145\/3580501"},{"key":"30_CR14","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q., Sung, Y.H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International conference on machine learning. pp. 4904\u20134916. 
PMLR (2021)"},{"key":"30_CR15","doi-asserted-by":"publisher","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q.V., Sung, Y., Li, Z., Duerig, T.: Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision (Jun 2021).https:\/\/doi.org\/10.48550\/arXiv.2102.05918","DOI":"10.48550\/arXiv.2102.05918"},{"key":"30_CR16","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: Vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning. pp. 5583\u20135594. PMLR (2021)"},{"key":"30_CR17","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)"},{"key":"30_CR18","doi-asserted-by":"crossref","unstructured":"Lee, K.H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Proceedings of the European conference on computer vision (ECCV). pp. 201\u2013216 (2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"30_CR19","unstructured":"Li, J., Mo, W., Qiang, W., Su, B., Zheng, C.: Supporting vision-language model inference with causality-pruning knowledge prompt. arXiv preprint arXiv:2205.11100 (2022)"},{"key":"30_CR20","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34, 9694\u20139705 (2021)"},{"key":"30_CR21","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp. 4654\u20134662 (2019)","DOI":"10.1109\/ICCV.2019.00475"},{"key":"30_CR22","doi-asserted-by":"publisher","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: A Simple and Performant Baseline for Vision and Language (Aug 2019https:\/\/doi.org\/10.48550\/arXiv.1908.03557","DOI":"10.48550\/arXiv.1908.03557"},{"key":"30_CR23","doi-asserted-by":"crossref","unstructured":"Li, L.H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., Wang, L., Yuan, L., Zhang, L., Hwang, J.N., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"30_CR24","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., Wang, L., Hu, H., Dong, L., Wei, F., et\u00a0al.: Oscar: Object-semantics aligned pre-training for vision-language tasks. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16. pp. 121\u2013137. Springer (2020)","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"30_CR25","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Bourdev, L., Girshick, R., Hays, J., Perona, P., Ramanan, D., Zitnick, C.L., Doll\u00e1r, P.: Microsoft COCO: Common Objects in Context (Feb 2015)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"30_CR26","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"30_CR27","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. 
Advances in neural information processing systems 32 (2019)"},{"key":"30_CR28","doi-asserted-by":"publisher","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks (Aug 2019https:\/\/doi.org\/10.48550\/arXiv.1908.02265","DOI":"10.48550\/arXiv.1908.02265"},{"key":"30_CR29","doi-asserted-by":"publisher","unstructured":"L\u00fcddecke, T., Ecker, A.S.: Image Segmentation Using Text and Image Prompts (Mar 2022https:\/\/doi.org\/10.48550\/arXiv.2112.10003","DOI":"10.48550\/arXiv.2112.10003"},{"key":"30_CR30","doi-asserted-by":"publisher","unstructured":"Manning, C., Surdeanu, M., Bauer, J., Finkel, J., Bethard, S., McClosky, D.: The Stanford CoreNLP Natural Language Processing Toolkit. In: Proceedings of 52nd Annual Meeting of the Association for Computational Linguistics: System Demonstrations. pp. 55\u201360. Association for Computational Linguistics, Baltimore, Maryland (2014).https:\/\/doi.org\/10.3115\/v1\/P14-5010","DOI":"10.3115\/v1\/P14-5010"},{"key":"30_CR31","doi-asserted-by":"crossref","unstructured":"Peng, Y., Huang, X., Zhao, Y.: An overview of cross-media retrieval: Concepts, methodologies, benchmarks, and challenges. IEEE Transactions on circuits and systems for video technology 28(9), 2372\u20132385 (2017)","DOI":"10.1109\/TCSVT.2017.2705068"},{"key":"30_CR32","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models (Sep 2016)","DOI":"10.1109\/ICCV.2015.303"},{"key":"30_CR33","doi-asserted-by":"publisher","unstructured":"Qi, D., Su, L., Song, J., Cui, E., Bharti, T., Sacheti, A.: ImageBERT: Cross-modal Pre-training with Large-scale Weak-supervised Image-Text Data (Jan 2020https:\/\/doi.org\/10.48550\/arXiv.2001.07966","DOI":"10.48550\/arXiv.2001.07966"},{"key":"30_CR34","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"30_CR35","doi-asserted-by":"publisher","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., Liu, P.J.: Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer (Jul 2020).https:\/\/doi.org\/10.48550\/arXiv.1910.10683","DOI":"10.48550\/arXiv.1910.10683"},{"key":"30_CR36","doi-asserted-by":"crossref","unstructured":"Rao, J., Wang, F., Ding, L., Qi, S., Zhan, Y., Liu, W., Tao, D.: Where does the performance improvement come from? -a reproducibility concern about image-text retrieval. In: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval. pp. 2727\u20132737 (2022)","DOI":"10.1145\/3477495.3531715"},{"key":"30_CR37","doi-asserted-by":"crossref","unstructured":"Rasiwasia, N., Costa\u00a0Pereira, J., Coviello, E., Doyle, G., Lanckriet, G.R., Levy, R., Vasconcelos, N.: A new approach to cross-modal multimedia retrieval. In: Proceedings of the 18th ACM international conference on Multimedia. pp. 251\u2013260 (2010)","DOI":"10.1145\/1873951.1873987"},{"key":"30_CR38","doi-asserted-by":"publisher","unstructured":"Wang, B., Yang, Y., Xu, X., Hanjalic, A., Shen, H.T.: Adversarial cross-modal retrieval. 
In: Proceedings of the 25th ACM International Conference on Multimedia. pp. 154\u2013162 (2017https:\/\/doi.org\/10.1145\/3123266.3123326","DOI":"10.1145\/3123266.3123326"},{"key":"30_CR39","doi-asserted-by":"crossref","unstructured":"Wang, W., Bao, H., Dong, L., Bjorck, J., Peng, Z., Liu, Q., Aggarwal, K., Mohammed, O.K., Singhal, S., Som, S., et\u00a0al.: Image as a foreign language: Beit pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"30_CR40","doi-asserted-by":"crossref","unstructured":"Wang, Y., Jian, X., Xue, B.: Balance act: Mitigating hubness in cross-modal retrieval with query and gallery banks. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing. pp. 10542\u201310567 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.652"},{"key":"30_CR41","doi-asserted-by":"crossref","unstructured":"Wang, Z., Liu, X., Li, H., Sheng, L., Yan, J., Wang, X., Shao, J.: Camp: Cross-modal adaptive message passing for text-image retrieval. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp. 5764\u20135773 (2019)","DOI":"10.1109\/ICCV.2019.00586"},{"key":"30_CR42","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhao, Y., Lu, C., Wei, S., Liu, L., Zhu, Z., Yan, S.: Cross-modal retrieval with CNN visual features: A new baseline. IEEE transactions on cybernetics 47(2), 449\u2013460 (2016)","DOI":"10.1109\/TCYB.2016.2519449"},{"key":"30_CR43","doi-asserted-by":"crossref","unstructured":"Wu, Y., Wang, S., Song, G., Huang, Q.: Learning fragment self-attention embeddings for image-text matching. In: Proceedings of the 27th ACM international conference on multimedia. pp. 2088\u20132096 (2019)","DOI":"10.1145\/3343031.3350940"},{"key":"30_CR44","unstructured":"Xia, J., Wu, L., Wang, G., Chen, J., Li, S.Z.: Progcl: Rethinking hard negative mining in graph contrastive learning. arXiv preprint arXiv:2110.02027 (2021)"},{"key":"30_CR45","doi-asserted-by":"publisher","unstructured":"Xu, J., De\u00a0Mello, S., Liu, S., Byeon, W., Breuel, T., Kautz, J., Wang, X.: GroupViT: Semantic Segmentation Emerges from Text Supervision (Jul 2022https:\/\/doi.org\/10.48550\/arXiv.2202.11094","DOI":"10.48550\/arXiv.2202.11094"},{"key":"30_CR46","doi-asserted-by":"publisher","unstructured":"Yang, L., Song, Y., Ren, X., Lyu, C., Wang, Y., Zhuo, J., Liu, L., Wang, J., Foster, J., Zhang, Y.: Out-of-distribution generalization in natural language processing: Past, present, and future. In: Bouamor, H., Pino, J., Bali, K. (eds.) Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing. pp. 4533\u20134559. Association for Computational Linguistics, Singapore (Dec 2023https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.276, https:\/\/aclanthology.org\/2023.emnlp-main.276","DOI":"10.18653\/v1\/2023.emnlp-main.276"},{"key":"30_CR47","doi-asserted-by":"publisher","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics 2, 67\u201378 (2014https:\/\/doi.org\/10.1162\/tacl_a_00166","DOI":"10.1162\/tacl_a_00166"},{"key":"30_CR48","unstructured":"Zeng, Y., Zhang, X., Li, H.: Multi-grained vision language pre-training: Aligning texts with visual concepts. 
arXiv preprint arXiv:2111.08276 (2021)"},{"key":"30_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Lei, Z., Zhang, Z., Li, S.Z.: Context-aware attention network for image-text retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 3536\u20133545 (2020)","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"30_CR50","doi-asserted-by":"crossref","unstructured":"Zhen, R., Song, W., He, Q., Cao, J., Shi, L., Luo, J.: Human-computer interaction system: A survey of talking-head generation. Electronics 12(1), \u00a0218 (2023)","DOI":"10.3390\/electronics12010218"}],"container-title":["Lecture Notes in Computer Science","Database Systems for Advanced Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-5555-4_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,11]],"date-time":"2025-01-11T06:08:27Z","timestamp":1736575707000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-5555-4_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819755547","9789819755554"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-5555-4_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DASFAA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Database Systems for Advanced Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Gifu","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dasfaa2024a","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dasfaa2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
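The record above is a standard Crossref "works" response for the chapter's DOI. Below is a minimal sketch of how such a record can be retrieved and its main bibliographic fields read out, assuming network access to the public Crossref REST API at api.crossref.org; the envelope and field names ("message", "title", "author", "issued", ...) follow the JSON printed above, and the snippet is an illustration rather than part of the record itself.

# Minimal sketch: fetch and summarise the Crossref work record shown above.
# Assumes network access to the public Crossref REST API (api.crossref.org);
# field names match the record printed above.
import json
import urllib.request

DOI = "10.1007/978-981-97-5555-4_30"  # DOI of the Flickr30K-CFQ chapter (from the record above)
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    payload = json.load(resp)  # same envelope as above: {"status": "ok", ..., "message": {...}}

work = payload["message"]
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work.get("author", []))

print("Title:     ", work["title"][0])
print("Authors:   ", authors)
print("Published: ", work["issued"]["date-parts"][0][0])
print("Container: ", "; ".join(work.get("container-title", [])))
print("Pages:     ", work.get("page"))
print("References:", work.get("references-count"))

Running this against the DOI above should print the chapter title, the seven authors, the 2025 publication year, the "Lecture Notes in Computer Science" container, pages 419-434, and the reference count of 50, mirroring the fields visible in the raw record.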