{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T12:19:32Z","timestamp":1743077972812,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":33,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819984282"},{"type":"electronic","value":"9789819984299"}],"license":[{"start":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T00:00:00Z","timestamp":1703376000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T00:00:00Z","timestamp":1703376000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-99-8429-9_17","type":"book-chapter","created":{"date-parts":[[2023,12,23]],"date-time":"2023-12-23T08:02:17Z","timestamp":1703318537000},"page":"210-221","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Causal Relations Enhanced CLIP for\u00a0Image-to-Text Retrieval"],"prefix":"10.1007","author":[{"given":"Wenjun","family":"Feng","sequence":"first","affiliation":[]},{"given":"Dazhen","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Donglin","family":"Cao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,24]]},"reference":[{"key":"17_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Diao, H., Zhang, Y., Ma, L., Lu, H.: Similarity reasoning and filtration for image-text matching. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 1218\u20131226 (2021)","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"17_CR4","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: VSE++: improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)"},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Feng, F., Zhang, J., He, X., Zhang, H., Chua, T.S.: Empowering language understanding with counterfactual reasoning. arXiv preprint arXiv:2106.03046 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.196"},{"key":"17_CR6","unstructured":"Frome, A., et al.: DeViSE: a deep visual-semantic embedding model. In: Advances in Neural Information Processing Systems, vol. 26 (2013)"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1440\u20131448 (2015)","DOI":"10.1109\/ICCV.2015.169"},{"issue":"3","key":"17_CR8","first-page":"261","volume":"16","author":"FO Isinkaye","year":"2015","unstructured":"Isinkaye, F.O., Folajimi, Y.O., Ojokoh, B.A.: Recommendation systems: principles, methods and evaluation. Egypt. Inf. J. 16(3), 261\u2013273 (2015)","journal-title":"Egypt. Inf. J."},{"key":"17_CR9","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"17_CR10","unstructured":"Kim, W., Son, B., Kim, I.: ViLT: vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning, pp. 5583\u20135594. PMLR (2021)"},{"key":"17_CR11","unstructured":"Kipf, T.N., Welling, M.: Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Landeiro, V., Culotta, A.: Robust text classification in the presence of confounding bias. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 30 (2016)","DOI":"10.1609\/aaai.v30i1.9997"},{"key":"17_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"212","DOI":"10.1007\/978-3-030-01225-0_13","volume-title":"Computer Vision \u2013 ECCV 2018","author":"K-H Lee","year":"2018","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11208, pp. 212\u2013228. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01225-0_13"},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 11336\u201311344 (2020)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"17_CR15","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"17_CR16","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"17_CR17","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Li, L., Gan, Z., Cheng, Y., Liu, J.: Relation-aware graph attention network for visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10313\u201310322 (2019)","DOI":"10.1109\/ICCV.2019.01041"},{"key":"17_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"17_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Liu, X., Yin, D., Feng, Y., Wu, Y., Zhao, D.: Everything has a cause: leveraging causal inference in legal text analysis. arXiv preprint arXiv:2104.09420 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.155"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Loper, E., Bird, S.: NLTK: the natural language toolkit. arXiv preprint cs\/0205028 (2002)","DOI":"10.3115\/1118108.1118117"},{"key":"17_CR23","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Messina, N., et al.: ALADIN: distilling fine-grained alignment scores for efficient image-text matching and retrieval. In: Proceedings of the 19th International Conference on Content-Based Multimedia Indexing, pp. 64\u201370 (2022)","DOI":"10.1145\/3549555.3549576"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Nan, G., Zeng, J., Qiao, R., Guo, Z., Lu, W.: Uncovering main causalities for long-tailed information extraction. arXiv preprint arXiv:2109.05213 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.763"},{"key":"17_CR26","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: GloVe: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"17_CR27","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"17_CR28","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/s41060-016-0032-z","volume":"3","author":"J Ramsey","year":"2017","unstructured":"Ramsey, J., Glymour, M., Sanchez-Romero, R., Glymour, C.: A million variables and more: the fast greedy equivalence search algorithm for learning high-dimensional graphical causal models, with an application to functional magnetic resonance images. Int. J. Data Sci. Anal. 3, 121\u2013129 (2017)","journal-title":"Int. J. Data Sci. Anal."},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Sharma, D., Shukla, R., Giri, A.K., Kumar, S.: A brief review on search engine optimization. In: 2019 9th International Conference on Cloud Computing, Data Science & Engineering (Confluence), pp. 687\u2013692. IEEE (2019)","DOI":"10.1109\/CONFLUENCE.2019.8776976"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Tang, K., Niu, Y., Huang, J., Shi, J., Zhang, H.: Unbiased scene graph generation from biased training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3716\u20133725 (2020)","DOI":"10.1109\/CVPR42600.2020.00377"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Wang, Y., Liang, D., Charlin, L., Blei, D.M.: Causal inference for recommender systems. In: Proceedings of the 14th ACM Conference on Recommender Systems, pp. 426\u2013431 (2020)","DOI":"10.1145\/3383313.3412225"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Yang, X., Zhang, H., Cai, J.: Deconfounded image captioning: a causal retrospect. IEEE Trans. Pattern Anal. Mach. Intell. (2021)","DOI":"10.1109\/TPAMI.2021.3121705"},{"key":"17_CR33","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-99-8429-9_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,23]],"date-time":"2023-12-23T08:18:56Z","timestamp":1703319536000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-99-8429-9_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,24]]},"ISBN":["9789819984282","9789819984299"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-981-99-8429-9_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023,12,24]]},"assertion":[{"value":"24 December 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Xiamen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/prcv2023.xmu.edu.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Microsoft CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1420","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"532","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"37% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3,78","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3,69","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}