{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,26]],"date-time":"2025-06-26T11:46:00Z","timestamp":1750938360842},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T00:00:00Z","timestamp":1658102400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T00:00:00Z","timestamp":1658102400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,7,18]]},"DOI":"10.1109\/ijcnn55064.2022.9892884","type":"proceedings-article","created":{"date-parts":[[2022,9,30]],"date-time":"2022-09-30T15:56:04Z","timestamp":1664553364000},"page":"1-8","source":"Crossref","is-referenced-by-count":3,"title":["ICAF: Iterative Contrastive Alignment Framework for Multimodal Abstractive Summarization"],"prefix":"10.1109","author":[{"given":"Zijian","family":"Zhang","sequence":"first","affiliation":[{"name":"Meituan-Dianping Group,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chang","family":"Shu","sequence":"additional","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Youxin","family":"Chen","sequence":"additional","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jing","family":"Xiao","sequence":"additional","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qian","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Nottingham Ningbo China,Ningbo,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lu","family":"Zheng","sequence":"additional","affiliation":[{"name":"University of Nottingham Ningbo China,Ningbo,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.552"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00098"},{"key":"ref31","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","author":"lin","year":"2004","journal-title":"Text Summarization Branches Out"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1417"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i18.17971"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1175"},{"key":"ref35","first-page":"1073","article-title":"Get to the point: Summarization with pointer-generator networks","author":"abigail","year":"2017","journal-title":"ACL"},{"key":"ref34","first-page":"379","article-title":"Sumit Chopra, and Jason Weston, &#x201C;A neural attention model for abstractive sentence summarization","author":"rush","year":"2015","journal-title":"EMNLP"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6525"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9534082"},{"key":"ref12","first-page":"5655","article-title":"Multimodal sentence sum-marization via multimodal selective encoding","author":"li","year":"2020","journal-title":"COLING Barcelona Spain (Online)"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3445794"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref15","article-title":"How 2: a large-scale dataset for multimodal language understanding","author":"sanabria","year":"2018","journal-title":"ViGIL NeurIPS"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1448"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3347318.3355524"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2018.2848260"},{"key":"ref19","first-page":"1934","article-title":"Deep learning for entity matching: A design space exploration","author":"mudgal","year":"2018","journal-title":"SIGMOD &#x2018;10"},{"key":"ref28","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"3rd International Conference on Learning Representations ICLR 2015"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"ref27","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"ICLRE"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-2031"},{"journal-title":"Unified language model pre-training for natural language understanding and generation","year":"2019","author":"dong","key":"ref6"},{"key":"ref29","first-page":"577585","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 1"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6332"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.217"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1387"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.144"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.33"},{"journal-title":"VSE++ Improving Visual-Semantic Embeddings with Hard Negatives","year":"2018","author":"fartash","key":"ref21"},{"key":"ref24","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref23","article-title":"Represen-tation learning with contrastive predictive coding","author":"van den oord","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref26","first-page":"2286","article-title":"Convit: Improving vision transformers with soft convolutional inductive biases","author":"ascoli","year":"2021","journal-title":"International Conference on Machine Learning"},{"key":"ref25","first-page":"22","article-title":"Cvt: Introducing convolutions to vision trans-formers","author":"wu","year":"2021","journal-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision"}],"event":{"name":"2022 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2022,7,18]]},"location":"Padua, Italy","end":{"date-parts":[[2022,7,23]]}},"container-title":["2022 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9891857\/9889787\/09892884.pdf?arnumber=9892884","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,3]],"date-time":"2022-11-03T19:00:52Z","timestamp":1667502052000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9892884\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7,18]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/ijcnn55064.2022.9892884","relation":{},"subject":[],"published":{"date-parts":[[2022,7,18]]}}}