{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:15:23Z","timestamp":1775578523353,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Guangdong Basic and Applied Basic Research Foundation","award":["2023A1515010688 and 2022A1515011018"],"award-info":[{"award-number":["2023A1515010688 and 2022A1515011018"]}]},{"name":"Natural Science Foundation of China","award":["62206180, 82261138629 and 12326610"],"award-info":[{"award-number":["62206180, 82261138629 and 12326610"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100015805","name":"Shenzhen Municipal Science and Technology Innovation Council","doi-asserted-by":"publisher","award":["JCYJ20220531101412030"],"award-info":[{"award-number":["JCYJ20220531101412030"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100015805","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100013261","name":"Guangdong Provincial Key Laboratory","doi-asserted-by":"publisher","award":["2023B1212060076"],"award-info":[{"award-number":["2023B1212060076"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100013261","id-type":"DOI","asserted-by":"publisher"}]},{"name":"XJTLU Research Development 
Funds","award":["RDF-23-01-053"],"award-info":[{"award-number":["RDF-23-01-053"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681287","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"58-67","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["FLIP-80M: 80 Million Visual-Linguistic Pairs for Facial Language-Image Pre-Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6779-8836","authenticated-orcid":false,"given":"Yudong","family":"Li","sequence":"first","affiliation":[{"name":"School of Computer Science and Software Engineering, Shenzhen University &amp; Shenzhen Institute of Artificial Intelligence and Robotics for Society, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8728-2842","authenticated-orcid":false,"given":"Xianxu","family":"Hou","sequence":"additional","affiliation":[{"name":"School of AI and Advanced Computing, Xi'an Jiaotong-Liverpool University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8683-3635","authenticated-orcid":false,"given":"Zheng","family":"Dezhi","sequence":"additional","affiliation":[{"name":"School of Computer Science and Software Engineering, Shenzhen University &amp; Guangdong Provincial Key Laboratory of Intelligent Information Processing, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1420-0815","authenticated-orcid":false,"given":"Linlin","family":"Shen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Software Engineering, Shenzhen University &amp; Shenzhen Institute of Artificial Intelligence and Robotics for Society, Shenzhen, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9496-5917","authenticated-orcid":false,"given":"Zhe","family":"Zhao","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"James Betker Gabriel Goh Li Jing Tim Brooks Jianfeng Wang Linjie Li Long Ouyang Juntang Zhuang Joyce Lee Yufei Guo et al. 2023. Improving image generation with better captions. Computer Science. https:\/\/cdn. openai. com\/papers\/dall-e-3. pdf Vol. 2 3 (2023) 8."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00615"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_7"},{"key":"e_1_3_2_1_5_1","volume-title":"Subpixel heatmap regression for facial landmark localization. arXiv preprint arXiv:2111.02360","author":"Bulat Adrian","year":"2021","unstructured":"Adrian Bulat, Enrique Sanchez, and Georgios Tzimiropoulos. 2021. Subpixel heatmap regression for facial landmark localization. arXiv preprint arXiv:2111.02360 (2021)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00451"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_49"},{"key":"e_1_3_2_1_9_1","volume-title":"Sharegpt4v: Improving large multi-modal models with better captions. 
arXiv preprint arXiv:2311.12793","author":"Chen Lin","year":"2023","unstructured":"Lin Chen, Jisong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2023. Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"e_1_3_2_1_11_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Deep Facial Synthesis: A New Challenge. arXiv preprint arXiv:2112.15439","author":"Fan Deng-Ping","year":"2021","unstructured":"Deng-Ping Fan, Ziling Huang, Peng Zheng, Hong Liu, Xuebin Qin, and Luc Van Gool. 2021. Deep Facial Synthesis: A New Challenge. arXiv preprint arXiv:2112.15439 (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1349-9"},{"key":"e_1_3_2_1_14_1","unstructured":"Samir Yitzhak Gadre Gabriel Ilharco Alex Fang Jonathan Hayase Georgios Smyrnis Thao Nguyen Ryan Marten Mitchell Wortsman Dhruba Ghosh Jieyu Zhang et al. 2023. DataComp: In search of the next generation of multimodal datasets. arXiv preprint arXiv:2304.14108 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Heterogeneous face attribute estimation: A deep multi-task learning approach","author":"Han Hu","year":"2017","unstructured":"Hu Han, Anil K Jain, Fang Wang, Shiguang Shan, and Xilin Chen. 2017. Heterogeneous face attribute estimation: A deep multi-task learning approach. IEEE transactions on pattern analysis and machine intelligence, Vol. 
40, 11 (2017), 2597--2609."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25176"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00307"},{"key":"e_1_3_2_1_18_1","volume-title":"Fatemeh Ghezloo, Dylan Stefan Chan Geva, Fatwir Sheikh Mohammed, Pavan Kumar Anand, Ranjay Krishna, and Linda Shapiro.","author":"Ikezogwo Wisdom Oluchi","year":"2023","unstructured":"Wisdom Oluchi Ikezogwo, Mehmet Saygin Seyfioglu, Fatemeh Ghezloo, Dylan Stefan Chan Geva, Fatwir Sheikh Mohammed, Pavan Kumar Anand, Ranjay Krishna, and Linda Shapiro. 2023. Quilt-1M: One Million Image-Text Pairs for Histopathology. arXiv preprint arXiv:2306.11207 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01521-4"},{"key":"e_1_3_2_1_21_1","volume-title":"Fairface: Face attribute dataset for balanced race, gender, and age. arXiv preprint arXiv:1908.04913","author":"K\u00e4rkk\u00e4inen Kimmo","year":"2019","unstructured":"Kimmo K\u00e4rkk\u00e4inen and Jungseock Joo. 2019. Fairface: Face attribute dataset for balanced race, gender, and age. arXiv preprint arXiv:1908.04913 (2019)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00159"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. 
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00826"},{"key":"e_1_3_2_1_25_1","volume-title":"Hih: Towards more accurate face alignment via heatmap in heatmap. arXiv preprint arXiv:2104.03100","author":"Lan Xing","year":"2021","unstructured":"Xing Lan, Qinghao Hu, Qiang Chen, Jian Xue, and Jian Cheng. 2021. Hih: Towards more accurate face alignment via heatmap in heatmap. arXiv preprint arXiv:2104.03100 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414037"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00559"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00414"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/152"},{"key":"e_1_3_2_1_30_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.277"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548205"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01469"},{"key":"e_1_3_2_1_34_1","volume-title":"Improved Baselines with Visual Instruction Tuning. 
In NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following.","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved Baselines with Visual Instruction Tuning. In NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following."},{"key":"e_1_3_2_1_35_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6832"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.425"},{"key":"e_1_3_2_1_38_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2018","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_39_1","volume-title":"Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781","author":"Mikolov Tomas","year":"2013","unstructured":"Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2740923"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.532"},{"key":"e_1_3_2_1_42_1","volume-title":"Shape preserving facial landmarks with graph attention networks. arXiv preprint arXiv:2210.07233","author":"Prados-Torreblanca Andr\u00e9s","year":"2022","unstructured":"Andr\u00e9s Prados-Torreblanca, Jos\u00e9 M Buenaposada, and Luis Baumela. 2022. 
Shape preserving facial landmarks with graph attention networks. arXiv preprint arXiv:2210.07233 (2022)."},{"key":"e_1_3_2_1_43_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0940-3"},{"key":"e_1_3_2_1_45_1","volume-title":"300 faces in-the-wild challenge: Database and results. Image and vision computing","author":"Sagonas Christos","year":"2016","unstructured":"Christos Sagonas, Epameinondas Antonakos, Georgios Tzimiropoulos, Stefanos Zafeiriou, and Maja Pantic. 2016. 300 faces in-the-wild challenge: Database and results. Image and vision computing, Vol. 47 (2016), 3--18."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.59"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2013.132"},{"key":"e_1_3_2_1_48_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 25278--25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_49_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. 
arXiv preprint arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475391"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3113780"},{"key":"e_1_3_2_1_52_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3032029"},{"key":"e_1_3_2_1_54_1","volume-title":"Self-instruct: Aligning language models with self-generated instructions. arXiv preprint arXiv:2212.10560","author":"Wang Yizhong","year":"2022","unstructured":"Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A Smith, Daniel Khashabi, and Hannaneh Hajishirzi. 2022. Self-instruct: Aligning language models with self-generated instructions. 
arXiv preprint arXiv:2212.10560 (2022)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.754"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00227"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00402"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1055-1"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"Zhe Zhao Yudong Li Cheng Hou Jing Zhao Rong Tian Weijie Liu Yiren Chen Ningyuan Sun Haoyan Liu Weiquan Mao et al. 2022. TencentPretrain: A Scalable and Flexible Toolkit for Pre-training Models of Different Modalities. arXiv preprint arXiv:2212.06385 (2022).","DOI":"10.18653\/v1\/2023.acl-demo.20"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01814"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"e_1_3_2_1_63_1","volume-title":"Generative Adversarial Network for Text-to-Face Synthesis and Manipulation with Pretrained BERT Model. In IEEE International Conference on Automatic Face and Gesture Recognition. 01--08","author":"Zhou Yutong","year":"2021","unstructured":"Yutong Zhou and Nobutaka Shimada. 2021. Generative Adversarial Network for Text-to-Face Synthesis and Manipulation with Pretrained BERT Model. In IEEE International Conference on Automatic Face and Gesture Recognition. 
01--08."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01485"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.371"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681287","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681287","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:43Z","timestamp":1750295863000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681287"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":65,"alternative-id":["10.1145\/3664647.3681287","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681287","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}