{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:33Z","timestamp":1765309533253,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62441225, 61972192, 62172208, 61906085"],"award-info":[{"award-number":["62441225, 61972192, 62172208, 61906085"]}]},{"name":"Fundamental Research Funds for the Central Universities","award":["4380001"],"award-info":[{"award-number":["4380001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3763784","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"14058-14065","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-Modal Prototype Augmentation and Dual-Grained Prompt Learning for Social Media Popularity Prediction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6056-3515","authenticated-orcid":false,"given":"Ao","family":"Zhou","sequence":"first","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1679-0174","authenticated-orcid":false,"given":"Mingsheng","family":"Tu","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7920-4309","authenticated-orcid":false,"given":"Luping","family":"Wang","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6606-1732","authenticated-orcid":false,"given":"Tenghao","family":"Sun","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8486-2614","authenticated-orcid":false,"given":"Zifeng","family":"Cheng","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9497-6244","authenticated-orcid":false,"given":"Yafeng","family":"Yin","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5243-4992","authenticated-orcid":false,"given":"Zhiwei","family":"Jiang","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1112-790X","authenticated-orcid":false,"given":"Qing","family":"Gu","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-024-06727-4"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3356072"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551568"},{"key":"e_1_3_2_1_4_1","volume-title":"Multi-prompting decoder helps better language understanding. arXiv preprint arXiv:2406.06279","author":"Cheng Zifeng","year":"2024","unstructured":"Zifeng Cheng, Zhaoling Chen, Zhiwei Jiang, Yafeng Yin, Cong Wang, Shiping Ge, and Qing Gu. 2024. Multi-prompting decoder helps better language understanding. arXiv preprint arXiv:2406.06279 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"Contrastive Prompting Enhances Sentence Embeddings in LLMs through Inference-Time Steering. arXiv preprint arXiv:2505.12831","author":"Cheng Zifeng","year":"2025","unstructured":"Zifeng Cheng, Zhonghui Wang, Yuchen Fu, Zhiwei Jiang, Yafeng Yin, Cong Wang, and Qing Gu. 2025. Contrastive Prompting Enhances Sentence Embeddings in LLMs through Inference-Time Steering. arXiv preprint arXiv:2505.12831 (2025)."},{"key":"e_1_3_2_1_6_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3356062"},{"key":"e_1_3_2_1_8_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, G Heigold, S Gelly, et al., 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_9_1","volume-title":"Token prepending: A training-free approach for eliciting better sentence embeddings from llms. arXiv preprint arXiv:2412.11556","author":"Fu Yuchen","year":"2024","unstructured":"Yuchen Fu, Zifeng Cheng, Zhiwei Jiang, Zhonghui Wang, Yafeng Yin, Zhengliang Li, and Qing Gu. 2024. Token prepending: A training-free approach for eliciting better sentence embeddings from llms. arXiv preprint arXiv:2412.11556 (2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32320"},{"key":"e_1_3_2_1_11_1","volume-title":"Fine-Grained Alignment Network for Zero-Shot Cross-Modal Retrieval. ACM Transactions on Multimedia Computing, Communications and Applications","author":"Ge Shiping","year":"2025","unstructured":"Shiping Ge, Zhiwei Jiang, Yafeng Yin, Cong Wang, Zifeng Cheng, and Qing Gu. 2025b. Fine-Grained Alignment Network for Zero-Shot Cross-Modal Retrieval. ACM Transactions on Multimedia Computing, Communications and Applications (2025)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00275"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2600428.2609558"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3266443"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3688998"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3266439"},{"key":"e_1_3_2_1_18_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Ke Guolin","year":"2017","unstructured":"Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A Highly Efficient Gradient Boosting Decision Tree. Advances in Neural Information Processing Systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539454"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3416273"},{"key":"e_1_3_2_1_21_1","first-page":"87874","article-title":"What matters when building vision-language models","volume":"37","author":"Lauren\u00e7on Hugo","year":"2024","unstructured":"Hugo Lauren\u00e7on, L\u00e9o Tronchon, Matthieu Cord, and Victor Sanh. 2024. What matters when building vision-language models? Advances in Neural Information Processing Systems, Vol. 37 (2024), 87874-87907.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3266438"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3266438"},{"key":"e_1_3_2_1_26_1","volume-title":"Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, Matthew Blaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612839"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872427.2883001"},{"key":"e_1_3_2_1_29_1","volume-title":"Anna Veronika Dorogush, and Andrey Gulin","author":"Prokhorenkova Liudmila","year":"2018","unstructured":"Liudmila Prokhorenkova, Gleb Gusev, Aleksandr Vorobev, Anna Veronika Dorogush, and Andrey Gulin. 2018. CatBoost: unbiased boosting with categorical features. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_31_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1-67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_32_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3688999"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3416294"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3416294"},{"key":"e_1_3_2_1_36_1","volume-title":"Sequential prediction of social media popularity with deep temporal context networks. arXiv preprint arXiv:1712.04443","author":"Wu Bo","year":"2017","unstructured":"Bo Wu, Wen-Huang Cheng, Yongdong Zhang, Qiushi Huang, Jintao Li, and Tao Mei. 2017. Sequential prediction of social media popularity with deep temporal context networks. arXiv preprint arXiv:1712.04443 (2017)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613853"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3688996"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.9970"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551576"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2783401"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3763784","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:41:04Z","timestamp":1765309264000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3763784"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":42,"alternative-id":["10.1145\/3746027.3763784","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3763784","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}