{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:48:21Z","timestamp":1775580501292,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612371","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"86-95","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["MM-AU:Towards Multimodal Understanding of Advertisement Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5281-1695","authenticated-orcid":false,"given":"Digbalay","family":"Bose","sequence":"first","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0904-0573","authenticated-orcid":false,"given":"Rajat","family":"Hebbar","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2053-9068","authenticated-orcid":false,"given":"Tiantian","family":"Feng","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2845-1079","authenticated-orcid":false,"given":"Krishna","family":"Somandepalli","sequence":"additional","affiliation":[{"name":"Google Research, New York, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5780-9797","authenticated-orcid":false,"given":"Anfeng","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1052-6204","authenticated-orcid":false,"given":"Shrikanth","family":"Narayanan","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"YouTube-8M: A Large-Scale Video Classification Benchmark. ArXiv","author":"Abu-El-Haija Sami","year":"2016","unstructured":"Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Apostol Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan. 2016. YouTube-8M: A Large-Scale Video Classification Benchmark. ArXiv, Vol. abs\/1609.08675 (2016)."},{"key":"e_1_3_2_1_2_1","volume-title":"VATT: Transformers for Multimodal Self-Supervised Learning from Raw Video, Audio and Text. In Neural Information Processing Systems.","author":"Akbari Hassan","year":"2021","unstructured":"Hassan Akbari, Linagzhe Yuan, Rui Qian, Wei-Hong Chuang, Shih-Fu Chang, Yin Cui, and Boqing Gong. 2021. VATT: Transformers for Multimodal Self-Supervised Learning from Raw Video, Audio and Text. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Bain Max","year":"2020","unstructured":"Max Bain, Arsha Nagrani, Andrew Brown, and Andrew Zisserman. 2020. Condensed movies: Story based retrieval with contextual embeddings. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2396531"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00212"},{"key":"e_1_3_2_1_6_1","volume-title":"Pennebaker","author":"Boyd Ryan L.","year":"2020","unstructured":"Ryan L. Boyd, Kate G. Blackburn, and James W. Pennebaker. 2020. The narrative arc: Revealing core narrative structures through text analysis. Science Advances, Vol. 6 (2020)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1080\/10646175.2019.1666068"},{"key":"e_1_3_2_1_8_1","unstructured":"Cannes. 2017. Cannes Lions. https:\/\/www.canneslions.com\/"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_10_1","unstructured":"Hyung Won Chung Le Hou S. Longpre et al. 2022. Scaling Instruction-Finetuned Language Models. ArXiv Vol. abs\/2210.11416 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"Social Issues in America: An Encyclopedia. M.E","author":"Ciment James","unstructured":"James Ciment. 2006. Social Issues in America: An Encyclopedia. M.E. Sharpe, Armonk, NY."},{"key":"e_1_3_2_1_12_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv, Vol. abs\/1810.04805 (2019)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_35"},{"key":"e_1_3_2_1_14_1","unstructured":"Ellen Douglas-Cowie Roddy Cowie Ian Sneddon Cate Cox Orla Lowry Margaret McRorie Jean-Claude Martin Laurence Devillers Sarkis Abrilian Anton Batliner Noam Amir and Kostas Karpouzis. 2007. The HUMAINE Database: Addressing the Collection and Annotation of Naturalistic and Induced Emotional Data. In Affective Computing and Intelligent Interaction."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.4324\/9780203380260_chapter_9"},{"key":"e_1_3_2_1_16_1","unstructured":"Walter R. Fisher. 1987. Human Communication As Narration: Toward a Philosophy of Reason Value and Action."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"e_1_3_2_1_19_1","unstructured":"Google. [n. d.]. Diversity and inclusion in advertisement videos. https:\/\/www.thinkwithgoogle.com\/feature\/diversity-inclusion\/?vertical=All"},{"key":"e_1_3_2_1_20_1","volume-title":"AVA: A Video Dataset of Spatio-Temporally Localized Atomic Visual Actions. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Gu Chunhui","year":"2017","unstructured":"Chunhui Gu, Chen Sun, Sudheendra Vijayanarasimhan, Caroline Pantofaru, David A. Ross, George Toderici, Yeqing Li, Susanna Ricco, Rahul Sukthankar, Cordelia Schmid, and Jitendra Malik. 2017. AVA: A Video Dataset of Spatio-Temporally Localized Atomic Visual Actions. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2017), 6047--6056."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02039"},{"key":"e_1_3_2_1_22_1","volume-title":"Narayanan","author":"Hebbar Rajat","year":"2023","unstructured":"Rajat Hebbar, Digbalay Bose, Krishna Somandepalli, Veena Vijai, and Shrikanth S. Narayanan. 2023. A dataset for Audio-Visual Sound Event Detection in Movies. ArXiv, Vol. abs\/2302.07315 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1002\/mar.4220010206"},{"key":"e_1_3_2_1_26_1","volume-title":"Movienet: A holistic dataset for movie understanding. In Computer Vision--ECCV 2020: 16th European Conference","author":"Huang Qingqiu","year":"2020","unstructured":"Qingqiu Huang, Yu Xiong, Anyi Rao, Jiaze Wang, and Dahua Lin. 2020. Movienet: A holistic dataset for movie understanding. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part IV 16. Springer, 709--727."},{"key":"e_1_3_2_1_27_1","volume-title":"Automatic Understanding of Image and Video Advertisements. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Hussain Zaeem","year":"2017","unstructured":"Zaeem Hussain, Mingda Zhang, Xiaozhong Zhang, Keren Ye, Christopher Thomas, Zuha Agha, Nathan Ong, and Adriana Kovashka. 2017. Automatic Understanding of Image and Video Advertisements. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017), 1100--1110."},{"key":"e_1_3_2_1_28_1","volume-title":"Xian Li, Brian O'Horo, Gabriel Pereyra, Jeff Wang, Christopher Dewan, Asli Celikyilmaz, Luke Zettlemoyer, and Veselin Stoyanov.","author":"Iyer Srinivas","year":"2022","unstructured":"Srinivas Iyer, Xiaojuan Lin, Ramakanth Pasunuru, Todor Mihaylov, Daniel Simig, Ping Yu, Kurt Shuster, Tianlu Wang, Qing Liu, Punit Singh Koura, Xian Li, Brian O'Horo, Gabriel Pereyra, Jeff Wang, Christopher Dewan, Asli Celikyilmaz, Luke Zettlemoyer, and Veselin Stoyanov. 2022. OPT-IML: Scaling Language Model Instruction Meta Learning through the Lens of Generalization. ArXiv, Vol. abs\/2212.12017 (2022)."},{"key":"e_1_3_2_1_29_1","volume-title":"Perceiver IO: A General Architecture for Structured Inputs & Outputs. ArXiv","author":"Jaegle Andrew","year":"2021","unstructured":"Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Andrew Brock, Evan Shelhamer, Olivier J. H'enaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, and Jo ao Carreira. 2021. Perceiver IO: A General Architecture for Structured Inputs & Outputs. ArXiv, Vol. abs\/2107.14795 (2021)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3227425"},{"key":"e_1_3_2_1_31_1","volume-title":"Predicting Emotions in User-Generated Videos. In AAAI Conference on Artificial Intelligence.","author":"Jiang Yu-Gang","unstructured":"Yu-Gang Jiang, Baohan Xu, and X. Xue. 2014. Predicting Emotions in User-Generated Videos. In AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1080\/00913367.2016.1268984"},{"key":"e_1_3_2_1_33_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/T-AFFC.2011.15"},{"key":"e_1_3_2_1_35_1","volume-title":"Swee Hoon Ang, and Lynn Heng","author":"Leong Siew Meng","year":"1994","unstructured":"Siew Meng Leong, Swee Hoon Ang, and Lynn Heng. 1994. Using Drama to Persuade: the Effects of Involvement and Ad Form on Persuasion. ACR Asia-Pacific Advances (1994)."},{"key":"e_1_3_2_1_36_1","volume-title":"Annotating High-Level Structures of Short Stories and Personal Anecdotes. ArXiv","author":"Li Boyang Albert","year":"2017","unstructured":"Boyang Albert Li, Beth Cardier, Tong Wang, and Florian Metze. 2017. Annotating High-Level Structures of Short Stories and Personal Anecdotes. ArXiv, Vol. abs\/1710.06917 (2017)."},{"key":"e_1_3_2_1_37_1","volume-title":"Foundations and Recent Trends in Multimodal Machine Learning: Principles, Challenges, and Open Questions. ArXiv","author":"Liang Paul Pu","year":"2022","unstructured":"Paul Pu Liang, Amir Zadeh, and Louis-Philippe Morency. 2022. Foundations and Recent Trends in Multimodal Machine Learning: Principles, Challenges, and Open Questions. ArXiv, Vol. abs\/2209.03430 (2022)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jbusres.2011.12.016"},{"key":"e_1_3_2_1_39_1","volume-title":"ECCV Workshops.","author":"Lin Rongcheng","year":"2018","unstructured":"Rongcheng Lin, Jing Xiao, and Jianping Fan. 2018. NeXtVLAD: An Efficient Neural Network to Aggregate Frame-level Features for Large-scale Video Classification. In ECCV Workshops."},{"key":"e_1_3_2_1_40_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2014.2384198"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"David Mick. 1987. Toward a Semiotic of Advertising Story Grammars.","DOI":"10.1515\/9783110853254.249"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.2501\/S0021849910091300"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2901464"},{"key":"e_1_3_2_1_45_1","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume":"34","author":"Nagrani Arsha","year":"2021","unstructured":"Arsha Nagrani, Shan Yang, Anurag Arnab, Aren Jansen, Cordelia Schmid, and Chen Sun. 2021. Attention bottlenecks for multimodal fusion. Advances in Neural Information Processing Systems, Vol. 34 (2021), 14200--14213.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","volume-title":"Media advertising spending in the United States from 2020 to","author":"Navarro Jos\u00e9 Gabriel","year":"2024","unstructured":"Jos\u00e9 Gabriel Navarro. 2023. Media advertising spending in the United States from 2020 to 2024. https:\/\/www.statista.com\/statistics\/272314\/advertising-spending-in-the-us\/#: :text=According%20to%20market%20estimates%2C%20total,grow%20to%20322%20billion%20dollars.. ."},{"key":"e_1_3_2_1_47_1","unstructured":"Ads of the World. [n. d.]. Ads of the World. https:\/\/www.adsoftheworld.com\/"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2019.2955949"},{"key":"e_1_3_2_1_49_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. ArXiv Vol. abs\/2303.08774 (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1257"},{"key":"e_1_3_2_1_51_1","volume-title":"MovieCuts: A New Dataset and Benchmark for Cut Type Recognition. In European Conference on Computer Vision.","author":"Pardo A.","year":"2021","unstructured":"A. Pardo, Fabian Caba Heilbron, Juan Le'on Alc'azar, Ali K. Thabet, and Bernard Ghanem. 2021. MovieCuts: A New Dataset and Benchmark for Cut Type Recognition. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_52_1","volume-title":"Advertising and Society : an Introduction","author":"Pardun J Carol","unstructured":"J Carol Pardun. 2013. Advertising and Society : an Introduction. John Wiley & Sons, Inc., New York, NY, USA."},{"key":"e_1_3_2_1_53_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_54_1","volume-title":"Wells","author":"Puto Christopher P.","year":"1984","unstructured":"Christopher P. Puto and William D. Wells. 1984. Informational and Transformational Advertising: the Differential Effects of Time. ACR North American Advances (1984)."},{"key":"e_1_3_2_1_55_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_56_1","volume-title":"Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever.","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. ArXiv, Vol. abs\/2212.04356 (2022)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2020.2964549"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123444"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136796"},{"key":"e_1_3_2_1_60_1","volume-title":"Arunim Gupta, Milan Aggarwal, Aditya Garg, Ayushi Bhardwaj, Tushar, Balaji Krishnamurthy, Rajiv Ratn Shah, and Changyou Chen.","author":"Singla Yaman Kumar","year":"2022","unstructured":"Yaman Kumar Singla, Rajat Aayush Jha, Arunim Gupta, Milan Aggarwal, Aditya Garg, Ayushi Bhardwaj, Tushar, Balaji Krishnamurthy, Rajiv Ratn Shah, and Changyou Chen. 2022. Persuasion Strategies in Advertisements: Dataset, Modeling, and Baselines. ArXiv, Vol. abs\/2208.09626 (2022)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2020.3047978"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3243026"},{"key":"e_1_3_2_1_64_1","volume-title":"EEV Dataset: Predicting Expressions Evoked by Diverse Videos. ArXiv","author":"Sun Jennifer J.","year":"2020","unstructured":"Jennifer J. Sun, Ting Liu, Alan S. Cowen, Florian Schroff, Hartwig Adam, and Gautam Prasad. 2020. EEV Dataset: Predicting Expressions Evoked by Diverse Videos. ArXiv, Vol. abs\/2001.05488 (2020)."},{"key":"e_1_3_2_1_65_1","volume-title":"Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca.","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, et al. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1287\/mksc.2014.0854"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_68_1","volume-title":"Multimodal Content Analysis for Effective Advertisements on YouTube. 2017 IEEE International Conference on Data Mining (ICDM)","author":"Vedula Nikhita","year":"2017","unstructured":"Nikhita Vedula, Wei Sun, Hyunhwan Lee, Harsh Gupta, Mitsunori Ogihara, Joseph Johnson, Gang Ren, and Srinivasan Parthasarathy. 2017. Multimodal Content Analysis for Effective Advertisements on YouTube. 2017 IEEE International Conference on Data Mining (ICDM) (2017), 1123--1128."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1080\/02650487.2008.11073051"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479202"},{"key":"e_1_3_2_1_71_1","volume-title":"Story Understanding in Video Advertisements. In British Machine Vision Conference.","author":"Ye Keren","year":"2018","unstructured":"Keren Ye, Kyle Buettner, and Adriana Kovashka. 2018. Story Understanding in Video Advertisements. In British Machine Vision Conference."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2947440"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413582"},{"key":"e_1_3_2_1_74_1","volume-title":"Attract me to Buy: Advertisement Copywriting Generation with Multimodal Multi-structured Information. ArXiv","author":"Zhang Zhipeng","year":"2022","unstructured":"Zhipeng Zhang, Xinglin Hou, Kai Niu, Zhongzhen Huang, Tiezheng Ge, Yuning Jiang, Qi Wu, and Peifeng Wang. 2022. Attract me to Buy: Advertisement Copywriting Generation with Multimodal Multi-structured Information. ArXiv, Vol. abs\/2205.03534 (2022)."},{"key":"e_1_3_2_1_75_1","unstructured":"Wayne Xin Zhao Kun Zhou Junyi Li et al. 2023. A Survey of Large Language Models. ArXiv Vol. abs\/2303.18223 (2023)."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1186\/s13640-017-0194-1"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612371","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612371","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:54:56Z","timestamp":1755820496000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612371"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":76,"alternative-id":["10.1145\/3581783.3612371","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612371","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}