{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T20:11:22Z","timestamp":1759176682736,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700234","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SITransformer: Shared Information-Guided Transformer for Extreme Multimodal Summarization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0137-6367","authenticated-orcid":false,"given":"Sicheng","family":"Liu","sequence":"first","affiliation":[{"name":"The University of Sydney, Darlington, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0698-4049","authenticated-orcid":false,"given":"Lintao","family":"Wang","sequence":"additional","affiliation":[{"name":"The University of Sydney, Darlington, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0647-4747","authenticated-orcid":false,"given":"Xiaogang","family":"Zhu","sequence":"additional","affiliation":[{"name":"The University of Adelaide, Adelaide, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0959-408X","authenticated-orcid":false,"given":"Xuequan","family":"Lu","sequence":"additional","affiliation":[{"name":"La Trobe University, Melbourne, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8043-0312","authenticated-orcid":false,"given":"Zhiyong","family":"Wang","sequence":"additional","affiliation":[{"name":"The University of Sydney, Darlington, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6891-8059","authenticated-orcid":false,"given":"Kun","family":"Hu","sequence":"additional","affiliation":[{"name":"The University of Sydney, Darlington, Australia"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3463630"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531404"},{"key":"e_1_3_3_1_4_2","first-page":"214","volume-title":"International conference on machine learning","author":"Arjovsky Martin","year":"2017","unstructured":"Martin Arjovsky, Soumith Chintala, and L\u00e9on Bottou. 2017. Wasserstein generative adversarial networks. In International conference on machine learning. PMLR, 214\u2013223. https:\/\/dl.acm.org\/doi\/10.5555\/3305381.3305404"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00238"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20887-5_35"},{"key":"e_1_3_3_1_7_2","unstructured":"Eric Jang Shixiang Gu and Ben Poole. 2016. Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1611.01144 (2016)."},{"key":"e_1_3_3_1_8_2","unstructured":"Hanlei Jin Yang Zhang Dan Meng Jun Wang and Jinghua Tan. 2024. A comprehensive survey on process-oriented automatic text summarization with exploration of llm-based methods. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.02901 (2024)."},{"key":"e_1_3_3_1_9_2","unstructured":"Katharina Kann Sascha Rothe and Katja Filippova. 2018. Sentence-level fluency evaluation: References help but can be spared! arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1809.08731 (2018)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/215206.215333"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Mike Lewis Yinhan Liu Naman Goyal Marjan Ghazvininejad Abdelrahman Mohamed Omer Levy Ves Stoyanov and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation translation and comprehension. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1910.13461 (2019).","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"e_1_3_3_1_12_2","first-page":"19730","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730\u201319742."},{"key":"e_1_3_3_1_13_2","unstructured":"Mingzhe Li Xiuying Chen Shen Gao Zhangming Chan Dongyan Zhao and Rui Yan. 2020. Vmsmo: Learning to generate multimodal summary for video-based news articles. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.05406 (2020)."},{"key":"e_1_3_3_1_14_2","first-page":"74","volume-title":"Text summarization branches out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Nayu Liu Xian Sun Hongfeng Yu Fanglong Yao Guangluan Xu and Kun Fu. 2022. Abstractive summarization for video: A revisit in multistage fusion network with forget gate. IEEE Transactions on Multimedia (2022).","DOI":"10.1109\/TMM.2022.3157993"},{"key":"e_1_3_3_1_16_2","unstructured":"Yang Liu. 2019. Fine-tune BERT for extractive summarization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1903.10318 (2019)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Yang Liu and Mirella Lapata. 2019. Text summarization with pretrained encoders. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1908.08345 (2019).","DOI":"10.18653\/v1\/D19-1387"},{"key":"e_1_3_3_1_18_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462959"},{"key":"e_1_3_3_1_20_2","unstructured":"Weizhen Qi Yu Yan Yeyun Gong Dayiheng Liu Nan Duan Jiusheng Chen Ruofei Zhang and Ming Zhou. 2020. Prophetnet: Predicting future n-gram for sequence-to-sequence pre-training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2001.04063 (2020)."},{"key":"e_1_3_3_1_21_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_1_23_2","unstructured":"Abigail See Peter\u00a0J Liu and Christopher\u00a0D Manning. 2017. Get to the point: Summarization with pointer-generator networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1704.04368 (2017)."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.229"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Peggy Tang Kun Hu Rui Yan Lei Zhang Junbin Gao and Zhiyong Wang. 2022. OTExtSum: extractive text summarisation with optimal transport. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.10086 (2022).","DOI":"10.18653\/v1\/2022.findings-naacl.85"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612004"},{"key":"e_1_3_3_1_27_2","unstructured":"Peggy Tang Kun Hu Lei Zhang Jiebo Luo and Zhiyong Wang. 2023. Tldw: Extreme multimodal summarisation of news videos. IEEE Transactions on Circuits and Systems for Video Technology (2023)."},{"key":"e_1_3_3_1_28_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096265"},{"key":"e_1_3_3_1_30_2","first-page":"11328","volume-title":"International conference on machine learning","author":"Zhang Jingqing","year":"2020","unstructured":"Jingqing Zhang, Yao Zhao, Mohammad Saleh, and Peter Liu. 2020. Pegasus: Pre-training with extracted gap-sentences for abstractive summarization. In International conference on machine learning. PMLR, 11328\u201311339."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_47"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21422"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Bin Zhao Maoguo Gong and Xuelong Li. 2022. Hierarchical multimodal transformer to summarize videos. Neurocomputing 468 (2022) 360\u2013369.","DOI":"10.1016\/j.neucom.2021.10.039"},{"key":"e_1_3_3_1_34_2","unstructured":"Kaizhi Zheng Xuehai He and Xin\u00a0Eric Wang. 2023. Minigpt-5: Interleaved vision-and-language generation via generative vokens. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.02239 (2023)."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1448"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6525"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Yubo Zhu Wentian Zhao Rui Hua and Xinxiao Wu. 2023. Topic-aware video summarization using multimodal transformer. Pattern Recognition 140 (2023) 109578.","DOI":"10.1016\/j.patcog.2023.109578"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Auckland New Zealand","acronym":"MMAsia '24"},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700234","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700234","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:16Z","timestamp":1750295416000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700234"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":36,"alternative-id":["10.1145\/3696409.3700234","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700234","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}