{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T14:00:22Z","timestamp":1758981622128,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":19,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819722648"},{"type":"electronic","value":"9789819722624"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-2262-4_2","type":"book-chapter","created":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T09:02:31Z","timestamp":1713949351000},"page":"15-27","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Geometrically-Aware Dual Transformer Encoding Visual and\u00a0Textual Features for\u00a0Image Captioning"],"prefix":"10.1007","author":[{"given":"Yu-Ling","family":"Chang","sequence":"first","affiliation":[]},{"given":"Hao-Shang","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Shiou-Chi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jen-Wei","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,4,25]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","key":"2_CR1","DOI":"10.1109\/CVPR.2018.00636"},{"doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","key":"2_CR2","DOI":"10.1109\/CVPR.2018.00636"},{"unstructured":"Chen, X., et al.: Microsoft coco captions: data collection and evaluation server. CoRR (2015)","key":"2_CR3"},{"doi-asserted-by":"crossref","unstructured":"Cornia, M., Baraldi, L., Cucchiara, R.: Show, control and tell: a framework for generating controllable and grounded captions. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8299\u20138308 (2019)","key":"2_CR4","DOI":"10.1109\/CVPR.2019.00850"},{"doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10575\u201310584","key":"2_CR5","DOI":"10.1109\/CVPR42600.2020.01059"},{"doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Zhu, X., Yao, P., Lu, S., Lu, H.: Normalized and geometry-aware self-attention network for image captioning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10324\u201310333 (2020)","key":"2_CR6","DOI":"10.1109\/CVPR42600.2020.01034"},{"doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.Y.: Attention on attention for image captioning. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 4633\u20134642 (2019)","key":"2_CR7","DOI":"10.1109\/ICCV.2019.00473"},{"doi-asserted-by":"crossref","unstructured":"Kuo, C., Kira, Z.: Beyond a pre-trained object detector: cross-modal textual and visual context for image captioning. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 17948\u201317958 (2022)","key":"2_CR8","DOI":"10.1109\/CVPR52688.2022.01744"},{"key":"2_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: OSCAR: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","key":"2_CR10","DOI":"10.1109\/ICCV48922.2021.00986"},{"doi-asserted-by":"crossref","unstructured":"Nguyen, V.Q., Suganuma, M., Okatani, T.: Grit: faster and better image captioning transformer using dual visual features, pp. 167\u2013184 (2022)","key":"2_CR11","DOI":"10.1007\/978-3-031-20059-5_10"},{"doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T.: X-linear attention networks for image captioning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10968\u201310977 (2020)","key":"2_CR12","DOI":"10.1109\/CVPR42600.2020.01098"},{"doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1179\u20131195 (2017)","key":"2_CR13","DOI":"10.1109\/CVPR.2017.131"},{"unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems (2017)","key":"2_CR14"},{"unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: SimVLM: Simple visual language model pretraining with weak supervision. In: International Conference on Learning Representations (2022)","key":"2_CR15"},{"doi-asserted-by":"crossref","unstructured":"Yang, X., Tang, K., Zhang, H., Cai, J.: Auto-encoding scene graphs for image captioning. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10677\u201310686 (2019)","key":"2_CR16","DOI":"10.1109\/CVPR.2019.01094"},{"key":"2_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"711","DOI":"10.1007\/978-3-030-01264-9_42","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Ting Yao","year":"2018","unstructured":"Yao, Ting, Pan, Yingwei, Li, Yehao, Mei, Tao: Exploring visual relationship for image captioning. In: Ferrari, Vittorio, Hebert, Martial, Sminchisescu, Cristian, Weiss, Yair (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 711\u2013727. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_42"},{"doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: VinVL: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5579\u20135588 (2021)","key":"2_CR18","DOI":"10.1109\/CVPR46437.2021.00553"},{"doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J.J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. ArXiv (2020)","key":"2_CR19","DOI":"10.1609\/aaai.v34i07.7005"}],"container-title":["Lecture Notes in Computer Science","Advances in Knowledge Discovery and Data Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-2262-4_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T09:17:08Z","timestamp":1713950228000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-2262-4_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819722648","9789819722624"],"references-count":19,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-2262-4_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"25 April 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PAKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific-Asia Conference on Knowledge Discovery and Data Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taipei","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taiwan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 May 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 May 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pakdd2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/pakdd2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}