{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:03:18Z","timestamp":1750309398053,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No. 2023YFF1205001"],"award-info":[{"award-number":["No. 2023YFF1205001"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Key Lab of Networked Multimedia"},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62250008, 62222209, 62102222"],"award-info":[{"award-number":["No. 62250008, 62222209, 62102222"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017582","name":"Beijing National Research Center For Information Science And Technology","doi-asserted-by":"publisher","award":["No. BNR2023RC01003, BNR2023TD03006"],"award-info":[{"award-number":["No. 
BNR2023RC01003, BNR2023TD03006"]}],"id":[{"id":"10.13039\/501100017582","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681502","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"8005-8014","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Neighbor Does Matter: Curriculum Global Positive-Negative Sampling for Vision-Language Pre-training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2504-3689","authenticated-orcid":false,"given":"Bin","family":"Huang","sequence":"first","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1396-375X","authenticated-orcid":false,"given":"Feng","family":"He","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3474-0643","authenticated-orcid":false,"given":"Qi","family":"Wang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0943-2286","authenticated-orcid":false,"given":"Hong","family":"Chen","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7796-5378","authenticated-orcid":false,"given":"Guohao","family":"Li","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0545-9183","authenticated-orcid":false,"given":"Zhifan","family":"Feng","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0351-2939","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"DCST, BNRist, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2236-9290","authenticated-orcid":false,"given":"Wenwu","family":"Zhu","sequence":"additional","affiliation":[{"name":"DCST, BNRist, Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"doi-asserted-by":"publisher","key":"e_1_3_2_1_1_1","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","volume-title":"Kriti Aggarwal, Subhojit Som, and Furu Wei.","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Wenhui Wang, Li Dong, Qiang Liu, Owais Khan Mohammed, Kriti Aggarwal, Subhojit Som, and Furu Wei. 2021. Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. arXiv preprint arXiv:2111.02358 (2021)."},{"key":"e_1_3_2_1_3_1","volume-title":"Representation learning: A review and new perspectives","author":"Bengio Yoshua","year":"2013","unstructured":"Yoshua Bengio, Aaron Courville, and Pascal Vincent. 2013. Representation learning: A review and new perspectives. IEEE transactions on pattern analysis and machine intelligence, Vol. 35, 8 (2013), 1798--1828."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.1145\/1553374.1553380"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1007\/978-3-031-19800-7_23"},{"key":"e_1_3_2_1_6_1","first-page":"26924","article-title":"Curriculum disentangled recommendation with noisy multi-feedback","volume":"34","author":"Chen Hong","year":"2021","unstructured":"Hong Chen, Yudong Chen, Xin Wang, Ruobing Xie, Rui Wang, Feng Xia, and Wenwu Zhu. 2021. Curriculum disentangled recommendation with noisy multi-feedback. Advances in Neural Information Processing Systems, Vol. 
34 (2021), 26924--26936.","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_7_1","DOI":"10.1145\/3581783.3612504"},{"key":"e_1_3_2_1_8_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_9_1","volume-title":"Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297","author":"Chen Xinlei","year":"2020","unstructured":"Xinlei Chen, Haoqi Fan, Ross Girshick, and Kaiming He. 2020. Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings, Part XXX. Springer, 104--120","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020. Uniter: Universal image-text representation learning. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX. Springer, 104--120."},{"unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. 
arXiv preprint arXiv:2010.11929 (2020).","key":"e_1_3_2_1_11_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_12_1","DOI":"10.1109\/CVPR52688.2022.01763"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.1109\/ICCV48922.2021.00945"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_14_1","DOI":"10.2307\/2346830"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_17_1","DOI":"10.1109\/TBDATA.2019.2921572"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_18_1","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_19_1","volume-title":"International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583--5594."},{"doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision Vol. 123 (2017) 32--73.","key":"e_1_3_2_1_20_1","DOI":"10.1007\/s11263-016-0981-7"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_21_1","DOI":"10.1609\/aaai.v37i1.25204"},{"key":"e_1_3_2_1_22_1","volume-title":"Curriculum graph machine learning: A survey. 
arXiv preprint arXiv:2302.02926","author":"Li Haoyang","year":"2023","unstructured":"Haoyang Li, Xin Wang, and Wenwu Zhu. 2023. Curriculum graph machine learning: A survey. arXiv preprint arXiv:2302.02926 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_25_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. 
Springer, 121--137."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740--755."},{"key":"e_1_3_2_1_28_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, Vol. 32 (2019)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_30_1","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_31_1","volume-title":"Im2text: Describing images using 1 million captioned photographs. Advances in neural information processing systems","author":"Ordonez Vicente","year":"2011","unstructured":"Vicente Ordonez, Girish Kulkarni, and Tamara Berg. 2011. Im2text: Describing images using 1 million captioned photographs. Advances in neural information processing systems, Vol. 24 (2011)."},{"key":"e_1_3_2_1_32_1","volume-title":"Multi-task graph neural architecture search with task-aware collaboration and curriculum. 
Advances in neural information processing systems","author":"Qin Yijian","year":"2024","unstructured":"Yijian Qin, Xin Wang, Ziwei Zhang, Hong Chen, and Wenwu Zhu. 2024. Multi-task graph neural architecture search with task-aware collaboration and curriculum. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_35_1","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_36_1","volume-title":"A corpus for reasoning about natural language grounded in photographs. arXiv preprint arXiv:1811.00491","author":"Suhr Alane","year":"2018","unstructured":"Alane Suhr, Stephanie Zhou, Ally Zhang, Iris Zhang, Huajun Bai, and Yoav Artzi. 2018. A corpus for reasoning about natural language grounded in photographs. arXiv preprint arXiv:1811.00491 (2018)."},{"key":"e_1_3_2_1_37_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. 
Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_38_1","volume-title":"What makes for good views for contrastive learning? Advances in neural information processing systems","author":"Tian Yonglong","year":"2020","unstructured":"Yonglong Tian, Chen Sun, Ben Poole, Dilip Krishnan, Cordelia Schmid, and Phillip Isola. 2020. What makes for good views for contrastive learning? Advances in neural information processing systems, Vol. 33 (2020), 6827--6839."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_39_1","DOI":"10.1109\/TPAMI.2021.3069908"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning. PMLR, 36174--36192","author":"Wang Xin","year":"2023","unstructured":"Xin Wang, Zirui Pan, Yuwei Zhou, Hong Chen, Chendi Ge, and Wenwu Zhu. 2023. Curriculum co-disentangled representation learning across multiple environments for social recommendation. In International Conference on Machine Learning. PMLR, 36174--36192."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_41_1","DOI":"10.1145\/3589335.3641257"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_42_1","DOI":"10.1145\/3581783.3612709"},{"key":"e_1_3_2_1_43_1","volume-title":"Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706","author":"Xie Ning","year":"2019","unstructured":"Ning Xie, Farley Lai, Derek Doran, and Asim Kadav. 2019. Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706 (2019)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_44_1","DOI":"10.1109\/CVPR52688.2022.01522"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_45_1","DOI":"10.1609\/aaai.v38i15.29580"},{"key":"e_1_3_2_1_46_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. 
arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_47_1","DOI":"10.1145\/3503161.3548549"},{"volume-title":"CurBench: Curriculum Learning Benchmark. In Forty-first International Conference on Machine Learning.","author":"Zhou Yuwei","unstructured":"Yuwei Zhou, Zirui Pan, Xin Wang, Hong Chen, Haoyang Li, Yanwen Huang, Zhixiao Xiong, Fangzhou Xiong, Peiyang Xu, Wenwu Zhu, et al. [n.,d.]. CurBench: Curriculum Learning Benchmark. In Forty-first International Conference on Machine Learning.","key":"e_1_3_2_1_48_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_49_1","DOI":"10.1145\/3503161.3548271"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_50_1","DOI":"10.1145\/3581783.3612468"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '24","name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia"},"container-title":["Proceedings of the 32nd ACM International Conference on 
Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681502","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681502","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681502"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":50,"alternative-id":["10.1145\/3664647.3681502","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681502","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}