{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T14:32:33Z","timestamp":1742913153918,"version":"3.40.3"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031466632"},{"type":"electronic","value":"9783031466649"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-46664-9_42","type":"book-chapter","created":{"date-parts":[[2023,11,4]],"date-time":"2023-11-04T13:02:29Z","timestamp":1699102949000},"page":"629-643","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-head Similarity Feature Representation and\u00a0Filtration for\u00a0Image-Text Matching"],"prefix":"10.1007","author":[{"given":"Mengqi","family":"Jiang","sequence":"first","affiliation":[]},{"given":"Shichao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Debo","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Leyuan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Guixian","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,5]]},"reference":[{"key":"42_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"42_CR2","doi-asserted-by":"crossref","unstructured":"Chen, H., Ding, G., Liu, X., Lin, Z., Liu, J., Han, J.: Imram: Iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12655\u201312663 (2020)","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"42_CR3","doi-asserted-by":"crossref","unstructured":"Chen, J., Hu, H., Wu, H., Jiang, Y., Wang, C.: Learning the best pooling strategy for visual semantic embedding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15789\u201315798 (2021)","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"42_CR4","doi-asserted-by":"publisher","first-page":"285","DOI":"10.1007\/s00530-015-0487-0","volume":"23","author":"D Cheng","year":"2017","unstructured":"Cheng, D., Zhang, S., Liu, X., Sun, K., Zong, M.: Feature selection by combining subspace learning with sparse representation. Multimedia Syst. 23, 285\u2013291 (2017)","journal-title":"Multimedia Syst."},{"issue":"17","key":"42_CR5","doi-asserted-by":"publisher","first-page":"23615","DOI":"10.1007\/s11042-022-12444-8","volume":"81","author":"Z Cui","year":"2022","unstructured":"Cui, Z., Hu, Y., Sun, Y., Gao, J., Yin, B.: Cross-modal alignment with graph reasoning for image-text retrieval. Multimed. Tools Appl. 81(17), 23615\u201323632 (2022)","journal-title":"Multimed. Tools Appl."},{"key":"42_CR6","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: VSE++: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)"},{"key":"42_CR7","doi-asserted-by":"publisher","first-page":"17479","DOI":"10.1007\/s11042-016-4119-2","volume":"76","author":"R Hu","year":"2017","unstructured":"Hu, R., et al.: Low-rank feature selection for multi-view regression. Multimed. Tools Appl. 76, 17479\u201317495 (2017)","journal-title":"Multimed. Tools Appl."},{"issue":"4","key":"42_CR8","doi-asserted-by":"publisher","first-page":"2008","DOI":"10.1109\/TIP.2018.2882225","volume":"28","author":"F Huang","year":"2018","unstructured":"Huang, F., Zhang, X., Zhao, Z., Li, Z.: Bi-directional spatial-semantic attention networks for image-text matching. IEEE Trans. Image Process. 28(4), 2008\u20132020 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"42_CR9","doi-asserted-by":"crossref","unstructured":"Ji, Z., Chen, K., Wang, H.: Step-wise hierarchical alignment network for image-text matching. arXiv preprint arXiv:2106.06509 (2021)","DOI":"10.24963\/ijcai.2021\/106"},{"key":"42_CR10","unstructured":"Kalibhat, N.M., Narang, K., Tan, L., Firooz, H., Sanjabi, M., Feizi, S.: Understanding failure modes of self-supervised learning. arXiv preprint arXiv:2203.01881 (2022)"},{"key":"42_CR11","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)"},{"key":"42_CR12","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"42_CR13","unstructured":"Lee, J., et al.: Uniclip: Unified framework for contrastive language-image pre-training. In: 36th Conference on Neural Information Processing Systems, NeurIPS 2022. Neural information processing systems foundation (2022)"},{"key":"42_CR14","doi-asserted-by":"crossref","unstructured":"Lee, K.H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 201\u2013216 (2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"42_CR15","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference On Computer Vision, pp. 4654\u20134662 (2019)","DOI":"10.1109\/ICCV.2019.00475"},{"key":"42_CR16","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) Computer Vision \u2013 ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"42_CR17","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, Z., Liu, A.A., Zhang, T., Wang, B., Zhang, Y.: Focus your attention: a bidirectional focal attention network for image-text matching. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 3\u201311 (2019)","DOI":"10.1145\/3343031.3350869"},{"key":"42_CR18","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, Z., Zhang, T., Xie, H., Wang, B., Zhang, Y.: Graph structured network for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10921\u201310930 (2020)","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"42_CR19","doi-asserted-by":"crossref","unstructured":"Liu, Y., Guo, Y., Bakker, E.M., Lew, M.S.: Learning a recurrent residual fusion network for multimodal matching. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4107\u20134116 (2017)","DOI":"10.1109\/ICCV.2017.442"},{"key":"42_CR20","doi-asserted-by":"crossref","unstructured":"Manning, C.D., Surdeanu, M., Bauer, J., Finkel, J.R., Bethard, S., McClosky, D.: The stanford corenlp natural language processing toolkit. In: Proceedings of 52nd Annual Meeting of the Association for Computational Linguistics: System Demonstrations, pp. 55\u201360 (2014)","DOI":"10.3115\/v1\/P14-5010"},{"key":"42_CR21","doi-asserted-by":"crossref","unstructured":"Nam, H., Ha, J.W., Kim, J.: Dual attention networks for multimodal reasoning and matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 299\u2013307 (2017)","DOI":"10.1109\/CVPR.2017.232"},{"key":"42_CR22","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"42_CR23","doi-asserted-by":"crossref","unstructured":"Qu, L., Liu, M., Cao, D., Nie, L., Tian, Q.: Context-aware multi-view summarization network for image-text matching. In: Proceedings of the 28th ACM International Conference On Multimedia, pp. 1047\u20131055 (2020)","DOI":"10.1145\/3394171.3413961"},{"key":"42_CR24","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: Towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems 28 (2015)"},{"issue":"11","key":"42_CR25","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Signal Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Signal Process."},{"key":"42_CR26","unstructured":"Velickovic, P., Cucurull, G., Casanova, A., Romero, A., Lio, P., Bengio, Y., et al.: Graph attention networks. stat 1050(20), 10\u201348550 (2017)"},{"key":"42_CR27","unstructured":"Vendrov, I., Kiros, R., Fidler, S., Urtasun, R.: Order-embeddings of images and language. arXiv preprint arXiv:1511.06361 (2015)"},{"key":"42_CR28","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5005\u20135013 (2016)","DOI":"10.1109\/CVPR.2016.541"},{"key":"42_CR29","doi-asserted-by":"crossref","unstructured":"Wang, S., Wang, R., Yao, Z., Shan, S., Chen, X.: Cross-modal scene graph matching for relationship-aware image-text retrieval. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1508\u20131517 (2020)","DOI":"10.1109\/WACV45572.2020.9093614"},{"key":"42_CR30","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Position focused attention network for image-text matching. In: Proceedings of the 28th International Joint Conference on Artificial Intelligence, pp. 3792\u20133798 (2019)","DOI":"10.24963\/ijcai.2019\/526"},{"key":"42_CR31","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Wasserstein coupled graph learning for cross-modal retrieval. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1793\u20131802. IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00183"},{"key":"42_CR32","doi-asserted-by":"crossref","unstructured":"Wei, X., Zhang, T., Li, Y., Zhang, Y., Wu, F.: Multi-modality cross attention network for image and sentence matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10941\u201310950 (2020)","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"42_CR33","doi-asserted-by":"crossref","unstructured":"Yu, R., Jin, F., Qiao, Z., Yuan, Y., Wang, G.: Multi-scale image-text matching network for scene and spatio-temporal images. Future Gen. Comput. Syst. 142, 292\u2013300 (2023)","DOI":"10.1016\/j.future.2023.01.004"},{"key":"42_CR34","doi-asserted-by":"publisher","first-page":"17461","DOI":"10.1007\/s11042-016-3980-3","volume":"76","author":"S Zhang","year":"2017","unstructured":"Zhang, S., Yang, L., Deng, Z., Cheng, D., Li, Y.: Leverage triple relational structures via low-rank feature reduction for multi-output regression. Multimed. Tools Appl. 76, 17461\u201317477 (2017)","journal-title":"Multimed. Tools Appl."},{"key":"42_CR35","doi-asserted-by":"crossref","unstructured":"Zhu, L., Xu, Z., Yang, Y.: Bidirectional multirate reconstruction for temporal modeling in videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2653\u20132662 (2017)","DOI":"10.1109\/CVPR.2017.147"}],"container-title":["Lecture Notes in Computer Science","Advanced Data Mining and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-46664-9_42","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,4]],"date-time":"2023-11-04T13:14:37Z","timestamp":1699103677000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-46664-9_42"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031466632","9783031466649"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-46664-9_42","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"5 November 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ADMA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Advanced Data Mining and Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shenyang","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 August 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"adma2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/adma2023.uqcloud.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes. Microsoft CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"503","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"216","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"43% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.97","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.77","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}