{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,22]],"date-time":"2024-12-22T05:03:48Z","timestamp":1734843828677,"version":"3.32.0"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,10,18]],"date-time":"2024-10-18T00:00:00Z","timestamp":1729209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,10,18]],"date-time":"2024-10-18T00:00:00Z","timestamp":1729209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,10,18]]},"DOI":"10.1109\/mlnlp63328.2024.10800131","type":"proceedings-article","created":{"date-parts":[[2024,12,20]],"date-time":"2024-12-20T18:56:07Z","timestamp":1734720967000},"page":"1-9","source":"Crossref","is-referenced-by-count":0,"title":["AliSum: Multimodal Summarization with Multimodal Output Boosted by Multimodal Alignment"],"prefix":"10.1109","author":[{"given":"Renxin","family":"Xu","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}]},{"given":"Yongqi","family":"Shao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}]},{"given":"Zihan","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}]},{"given":"Shijie","family":"Yang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}]},{"given":"Tao","family":"Fang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}]},{"given":"Hong","family":"Huo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1438"},{"key":"ref2","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Devlin","year":"2019"},{"journal-title":"An image is worth 16x16 words: Transformers for image recognition at scale","year":"2020","author":"Dosovitskiy","key":"ref3"},{"volume-title":"Retag: Reasoning aware table to analytic text generation","year":"2023","author":"Ghosal","key":"ref4"},{"journal-title":"Distilling the knowledge in a neural network","year":"2015","author":"Hinton","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3584700"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.12"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/s10844-023-00812-1"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/577"},{"key":"ref11","first-page":"1092","article-title":"Multi-modal summarization for asynchronous collection of text, image, audio and video","volume-title":"Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing","author":"Li","year":"2017"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.752"},{"key":"ref13","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Text Summarization Branches Out","author":"Lin","year":"2004"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1387"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.207"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1147\/rd.22.0159"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10958"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02069"},{"key":"ref19","doi-asserted-by":"crossref","first-page":"1584","DOI":"10.18653\/v1\/2023.findings-acl.101","article-title":"SCCS: Semantics-consistent cross-domain summarization via optimal transport alignment","volume-title":"Findings of the Association for Computational Linguistics: ACL 2023","author":"Qiu","year":"2023"},{"key":"ref20","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"issue":"1","key":"ref21","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"The Journal of Machine Learning Research"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-018-5749-3"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3590003.3590019"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612004"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2023.3296196"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_47"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21422"},{"key":"ref30","article-title":"Hierarchical cross-modality semantic correlation learning model for multimodal summarization","volume":"abs\/2112.12072","author":"Zhang","year":"2021","journal-title":"ArXiv"},{"key":"ref31","doi-asserted-by":"crossref","first-page":"779","DOI":"10.18653\/v1\/D18-1088","article-title":"Neural latent extractive document summarization","volume-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","author":"Zhang","year":"2018"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21431"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/IAECST57965.2022.10061897"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.552"},{"key":"ref35","doi-asserted-by":"crossref","DOI":"10.1609\/aaai.v32i1.12255","volume-title":"Deep reinforcement learning for unsupervised video summarization with diversity-representativeness reward","author":"Zhou","year":"2018"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1448"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6525"}],"event":{"name":"2024 7th International Conference on Machine Learning and Natural Language Processing (MLNLP)","start":{"date-parts":[[2024,10,18]]},"location":"Chengdu, China","end":{"date-parts":[[2024,10,20]]}},"container-title":["2024 7th International Conference on Machine Learning and Natural Language Processing (MLNLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10799945\/10799990\/10800131.pdf?arnumber=10800131","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,21]],"date-time":"2024-12-21T05:57:52Z","timestamp":1734760672000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10800131\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,18]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/mlnlp63328.2024.10800131","relation":{},"subject":[],"published":{"date-parts":[[2024,10,18]]}}}