{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:13:21Z","timestamp":1777655601888,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":20,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China under Grant","award":["62072469"],"award-info":[{"award-number":["62072469"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1145\/3589335.3651939","type":"proceedings-article","created":{"date-parts":[[2024,5,12]],"date-time":"2024-05-12T18:41:21Z","timestamp":1715539281000},"page":"1578-1585","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Federated Learning in Large Model Era: Vision-Language Model for Smart City Safety Operation Management"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1462-9905","authenticated-orcid":false,"given":"Zengxiang","family":"Li","sequence":"first","affiliation":[{"name":"ENN Group Co.,Ltd., Langfang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9795-0193","authenticated-orcid":false,"given":"Zhaoxiang","family":"Hou","sequence":"additional","affiliation":[{"name":"ENN Group Co.,Ltd., Langfang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6760-7945","authenticated-orcid":false,"given":"Hui","family":"Liu","sequence":"additional","affiliation":[{"name":"ENN Group Co.,Ltd., Langfang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2596-7058","authenticated-orcid":false,"given":"Tongzhi","family":"Li","sequence":"additional","affiliation":[{"name":"ENN Group Co.,Ltd., Langfang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3821-2589","authenticated-orcid":false,"given":"Chengyi","family":"Yang","sequence":"additional","affiliation":[{"name":"ENN Group Co.,Ltd., Langfang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8571-7340","authenticated-orcid":false,"given":"Ying","family":"Wang","sequence":"additional","affiliation":[{"name":"ENN Group Co.,Ltd., Langfang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9704-533X","authenticated-orcid":false,"given":"Chao","family":"Shi","sequence":"additional","affiliation":[{"name":"ENN Group Co.,Ltd., Langfang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7465-5729","authenticated-orcid":false,"given":"Longfei","family":"Xie","sequence":"additional","affiliation":[{"name":"ENN Group Co.,Ltd., Langfang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9800-1068","authenticated-orcid":false,"given":"Weishan","family":"Zhang","sequence":"additional","affiliation":[{"name":"China University of Petroleum, Qingdao, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3328-8106","authenticated-orcid":false,"given":"Liang","family":"Xu","sequence":"additional","affiliation":[{"name":"Qingdao Windaka Technology Co.,Ltd., Qingdao, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1236-531X","authenticated-orcid":false,"given":"Zelei","family":"Liu","sequence":"additional","affiliation":[{"name":"Unicom (Shanghai) Industrial Internet Co.,Ltd., Shanghai Shi, China"}]}],"member":"320","published-online":{"date-parts":[[2024,5,13]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems 35 (2022), 23716--23736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01007"},{"key":"e_1_3_2_2_3_1","volume-title":"Turian","author":"Bisk Yonatan","year":"2020","unstructured":"Yonatan Bisk, Ari Holtzman, Jesse Thomason, Jacob Andreas, Yoshua Bengio, Joyce Chai, Mirella Lapata, Angeliki Lazaridou, Jonathan May, Aleksandr Nisnevich, Nicolas Pinto, and Joseph P. Turian. 2020. Experience Grounds Language. CoRR abs\/2004.10151 (2020). arXiv:2004.10151 https:\/\/arxiv.org\/abs\/2004.10151"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1093\/jamia\/ocy017"},{"key":"e_1_3_2_2_5_1","volume-title":"Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al.","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi SM Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al. 2023. Palm-e: An embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Yulan Gao Hao Sun Zengxiang Li and Han Yu. 2023. The Prospect of Enhancing Large-Scale Heterogeneous Federated Learning with Transformers.","DOI":"10.1109\/ICME57554.2024.10688130"},{"key":"e_1_3_2_2_7_1","volume-title":"International conference on learning representations","volume":"1","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Norman Mu, Ekin Dogus Cubuk, Barret Zoph, Justin Gilmer, and Balaji Lakshminarayanan. 2020. Augmix: A simple method to improve robustness and uncertainty under data shift. In International conference on learning representations, Vol. 1. 5."},{"key":"e_1_3_2_2_8_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander C Berg Wan-Yen Lo et al. 2023. Segment anything. arXiv preprint arXiv:2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_2_10_1","volume-title":"Continual learning for robotics: Definition, framework, learning strategies, opportunities and challenges. Information fusion 58","author":"Lesort Timoth\u00e9e","year":"2020","unstructured":"Timoth\u00e9e Lesort, Vincenzo Lomonaco, Andrei Stoian, Davide Maltoni, David Filliat, and Natalia D\u00edaz-Rodr\u00edguez. 2020. Continual learning for robotics: Definition, framework, learning strategies, opportunities and challenges. Information fusion 58 (2020), 52--68."},{"key":"e_1_3_2_2_11_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_2_12_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021), 9694--9705."},{"key":"e_1_3_2_2_13_1","unstructured":"Brendan McMahan Eider Moore Daniel Ramage Seth Hampson and Blaise Aguera y Arcas. 2017. Communication-efficient learning of deep networks from decentralized data. In Artificial intelligence and statistics. PMLR 1273--1282."},{"key":"e_1_3_2_2_14_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. 2022. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems 35 (2022), 27730--27744.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00982"},{"key":"e_1_3_2_2_16_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_17_1","volume-title":"International Conference on Machine Learning. PMLR, 23318--23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning. PMLR, 23318--23340."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01100"},{"key":"e_1_3_2_2_19_1","volume-title":"Asynchronous federated optimization. arXiv preprint arXiv:1903.03934","author":"Xie Cong","year":"2019","unstructured":"Cong Xie, Sanmi Koyejo, and Indranil Gupta. 2019. Asynchronous federated optimization. arXiv preprint arXiv:1903.03934 (2019)."},{"key":"e_1_3_2_2_20_1","volume-title":"Multimodal Federated Learning via Contrastive Representation Ensemble. arXiv preprint arXiv:2302.08888","author":"Yu Qiying","year":"2023","unstructured":"Qiying Yu, Yang Liu, Yimu Wang, Ke Xu, and Jingjing Liu. 2023. Multimodal Federated Learning via Contrastive Representation Ensemble. arXiv preprint arXiv:2302.08888 (2023)."}],"event":{"name":"WWW '24: The ACM Web Conference 2024","location":"Singapore Singapore","acronym":"WWW '24","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Companion Proceedings of the ACM Web Conference 2024"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589335.3651939","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3589335.3651939","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:26:10Z","timestamp":1755822370000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589335.3651939"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":20,"alternative-id":["10.1145\/3589335.3651939","10.1145\/3589335"],"URL":"https:\/\/doi.org\/10.1145\/3589335.3651939","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]},"assertion":[{"value":"2024-05-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}