{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T13:50:38Z","timestamp":1762609838214,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"HKUST Special Support for Young Faculty","award":["F0927"],"award-info":[{"award-number":["F0927"]}]},{"name":"HKUST Sports Science and Technology Research Grant","award":["SSTRG24EG04"],"award-info":[{"award-number":["SSTRG24EG04"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3657607","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"1084-1088","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Improving Data Augmentation for Robust Visual Question Answering with Effective Curriculum Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9628-1940","authenticated-orcid":false,"given":"Yuhang","family":"Zheng","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1091-7994","authenticated-orcid":false,"given":"Zhen","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6148-9709","authenticated-orcid":false,"given":"Long","family":"Chen","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Ehsan Abbasnejad Damien Teney Amin Parvaneh Javen Shi and Anton van den Hengel. 2020. Counterfactual vision and language learning. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01006"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Vedika Agarwal Rakshith Shetty and Mario Fritz. 2020. Towards Causal VQA: Reveling and Reducing Spurious Correlations by Invariant and Covariant Semantic Editing. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00971"},{"volume-title":"Don't just assume","author":"Agrawal Aishwarya","key":"e_1_3_2_1_3_1","unstructured":"Aishwarya Agrawal, Dhruv Batra, Devi Parikh, and Aniruddha Kembhavi. 2018. Don't just assume; look and answer: Overcoming priors for visual question answering. In CVPR."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-up and top-down attention for image captioning and visual question answering. In CVPR.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_5_1","volume-title":"Vqa: Visual question answering. In ICCV. 2425--2433.","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C Lawrence Zitnick, and Devi Parikh. 2015. Vqa: Visual question answering. In ICCV. 2425--2433."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Narjes Askarian Ehsan Abbasnejad Ingrid Zukerman Wray Buntine and Gholamreza Haffari. 2022. Inductive Biases for Low Data VQA: A Data Augmentation Approach. In WACV. 231--240.","DOI":"10.1109\/WACVW54805.2022.00029"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Yoshua Bengio J\u00e9r\u00f4me Louradour Ronan Collobert and Jason Weston. 2009. Curriculum learning. In ICML. 41--48.","DOI":"10.1145\/1553374.1553380"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Yonatan Bitton Gabriel Stanovsky Roy Schwartz and Michael Elhadad. 2021. Automatic Generation of Contrast Sets from Scene Graphs: Probing the Compositional Consistency of GQA. In NAACL. 94--105.","DOI":"10.18653\/v1\/2021.naacl-main.9"},{"key":"e_1_3_2_1_9_1","volume-title":"COIN: Counterfactual Image Generation for VQA Interpretation. In arXiv.","author":"Boukhers Zeyd","year":"2022","unstructured":"Zeyd Boukhers, Timo Hartmann, and Jan J\u00fcrjens. 2022. COIN: Counterfactual Image Generation for VQA Interpretation. In arXiv."},{"key":"e_1_3_2_1_10_1","unstructured":"Remi Cadene Corentin Dancette Hedi Ben-younes Matthieu Cord and Devi Parikh. 2019. RUBi: Reducing Unimodal Biases in Visual Question Answering. In NeurIPS."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Long Chen Zhihong Jiang Jun Xiao and Wei Liu. 2021. Human-like controllable image captioning with verb-specific semantic roles. In CVPR. 16846--16856.","DOI":"10.1109\/CVPR46437.2021.01657"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6627"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Long Chen Yulei Niu Brian Chen Xudong Lin Guangxing Han Christopher Thomas Hammad Ayyubi Heng Ji and Shih-Fu Chang. 2022a. Weakly-supervised temporal article grounding. In EMNLP.","DOI":"10.18653\/v1\/2022.emnlp-main.639"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Long Chen Xin Yan Jun Xiao Hanwang Zhang Shiliang Pu and Yueting Zhuang. 2020b. Counterfactual samples synthesizing for robust visual question answering. In CVPR. 10800--10809.","DOI":"10.1109\/CVPR42600.2020.01081"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Long Chen Yuhang Zheng Yulei Niu Hanwang Zhang and Jun Xiao. 2023. Counterfactual samples synthesizing and training for robust visual question answering. (2023).","DOI":"10.1109\/TPAMI.2023.3290012"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Long Chen Yuhang Zheng and Jun Xiao. 2022b. Rethinking data augmentation for robust visual question answering. In ECCV. 95--112.","DOI":"10.1007\/978-3-031-20059-5_6"},{"key":"e_1_3_2_1_17_1","unstructured":"Christopher Clark Mark Yatskar and Luke Zettlemoyer. 2019. Don't Take the Easy Way Out: Ensemble Based Methods for Avoiding Known Dataset Biases. In EMNLP."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Xinzhi Dong Chengjiang Long Wenju Xu and Chunxia Xiao. 2021. Dual graph convolutional networks with transformer and curriculum learning for image captioning. In ACM MM. 2615--2624.","DOI":"10.1145\/3474085.3475439"},{"key":"e_1_3_2_1_19_1","volume-title":"Mutant: A training paradigm for out-of-distribution generalization in visual question answering. In EMNLP.","author":"Gokhale Tejas","year":"2020","unstructured":"Tejas Gokhale, Pratyay Banerjee, Chitta Baral, and Yezhou Yang. 2020a. Mutant: A training paradigm for out-of-distribution generalization in visual question answering. In EMNLP."},{"key":"e_1_3_2_1_20_1","volume-title":"Vqa-lol: Visual question answering under the lens of logic. In ECCV. 379--396.","author":"Gokhale Tejas","year":"2020","unstructured":"Tejas Gokhale, Pratyay Banerjee, Chitta Baral, and Yezhou Yang. 2020b. Vqa-lol: Visual question answering under the lens of logic. In ECCV. 379--396."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Yash Goyal Tejas Khot Douglas Summers-Stay Dhruv Batra and Devi Parikh. 2017. Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In CVPR. 6904--6913.","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Kushal Kafle Mohammed Yousefhussien and Christopher Kanan. 2017. Data augmentation for visual question answering. In INLG. 198--202.","DOI":"10.18653\/v1\/W17-3529"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Yash Kant Abhinav Moudgil Dhruv Batra Devi Parikh and Harsh Agrawal. 2021. Contrast and classify: Training robust vqa models. In ICCV. 1604--1613.","DOI":"10.1109\/ICCV48922.2021.00163"},{"key":"e_1_3_2_1_24_1","unstructured":"Jihyung Kil Cheng Zhang Dong Xuan and Wei-Lun Chao. 2021. Discovering the Unknown Knowns: Turning Implicit Knowledge in the Dataset into Explicit Training Examples for Visual Question Answering. In EMNLP."},{"key":"e_1_3_2_1_25_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In ICLR."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Camila Kolling Martin More Nathan Gavenski Eduardo Pooch Ot\u00e1vio Parraga and Rodrigo C Barros. 2022. Efficient Counterfactual Debiasing for Visual Question Answering. In WACV. 3001--3010.","DOI":"10.1109\/WACV51458.2022.00263"},{"key":"e_1_3_2_1_27_1","volume-title":"Self-paced learning for latent variable models. NeurIPS","author":"Kumar M","year":"2010","unstructured":"M Kumar, Benjamin Packer, and Daphne Koller. 2010. Self-paced learning for latent variable models. NeurIPS (2010)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Mingrui Lao Yanming Guo Yu Liu Wei Chen Nan Pu and Michael S Lew. 2021. From superficial to deep: Language bias driven curriculum learning for visual question answering. In ACM MM. 3370--3379.","DOI":"10.1145\/3474085.3475492"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Chenxin Li Mingbao Lin Zhiyuan Ding Nie Lin Yihong Zhuang Yue Huang Xinghao Ding and Liujuan Cao. 2022. Knowledge condensation distillation. In ECCV. 19--35.","DOI":"10.1007\/978-3-031-20083-0_2"},{"key":"e_1_3_2_1_30_1","volume-title":"Hinton","author":"Rafael M\u00fc","year":"2019","unstructured":"Rafael M\u00fc ller, Simon Kornblith, and Geoffrey E. Hinton. 2019. When does label smoothing help?. In NeurIPS. 4696--4705."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Yulei Niu Kaihua Tang Hanwang Zhang Zhiwu Lu Xian-Sheng Hua and Ji-Rong Wen. 2021. Counterfactual vqa: A cause-effect look at language bias. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"e_1_3_2_1_32_1","volume-title":"Causal Inference with Knowledge Distilling and Curriculum Learning for Unbiased VQA. ACM TOMM","author":"Pan Yonghua","year":"2022","unstructured":"Yonghua Pan, Zechao Li, Liyan Zhang, and Jinhui Tang. 2022. Causal Inference with Knowledge Distilling and Curriculum Learning for Unbiased VQA. ACM TOMM (2022), 1--23."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Paul Hongsuck Seo Piyush Sharma Tomer Levinboim Bohyung Han and Radu Soricut. 2020. Reinforcing an image caption generator using off-line human feedback. In AAAI. 2693--2700.","DOI":"10.1609\/aaai.v34i03.5655"},{"key":"e_1_3_2_1_34_1","unstructured":"Zhiqiang Shen Zechun Liu Dejia Xu Zitian Chen Kwang-Ting Cheng and Marios Savvides. 2021. Is Label Smoothing Truly Incompatible with Knowledge Distillation: An Empirical Study. In ICLR."},{"key":"e_1_3_2_1_35_1","volume-title":"Qi Wu, and Xiaokang Yang.","author":"Tang Ruixue","year":"2020","unstructured":"Ruixue Tang, Chao Ma, Wei Emma Zhang, Qi Wu, and Xiaokang Yang. 2020. Semantic equivalent adversarial data augmentation for visual question answering. In ECCV. 437--453."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Damien Teney Ehsan Abbasnejad and Anton van den Hengel. 2021. Unshuffling data for improved generalization. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00145"},{"key":"e_1_3_2_1_37_1","unstructured":"Damien Teney Kushal Kafle Robik Shrestha Ehsan Abbasnejad Christopher Kanan and Anton van den Hengel. 2020. On the value of out-of-distribution testing: An example of goodhart's law. In NeurIPS."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3069908"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Zixu Wang Yishu Miao and Lucia Specia. 2021b. Cross-Modal Generative Augmentation for Visual Question Answering. In BMVC.","DOI":"10.5244\/C.35.429"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Yuan Yao Ao Zhang Xu Han Mengdi Li Cornelius Weber Zhiyuan Liu Stefan Wermter and Maosong Sun. 2021. Visual distant supervision for scene graph generation. In ICCV. 15816--15826.","DOI":"10.1109\/ICCV48922.2021.01552"},{"key":"e_1_3_2_1_41_1","volume-title":"Abdulmotaleb El Saddik, and Heng Tao Shen","author":"Zheng Chaofan","year":"2022","unstructured":"Chaofan Zheng, Lianli Gao, Xinyu Lyu, Pengpeng Zeng, Abdulmotaleb El Saddik, and Heng Tao Shen. 2022. Dual-branch Hybrid Learning Network for Unbiased Scene Graph Generation. arXiv (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Xi Zhu Zhendong Mao Chunxiao Liu Peng Zhang Bin Wang and Yongdong Zhang. 2020. Overcoming language priors with self-supervised learning for visual question answering. In IJCAI.","DOI":"10.24963\/ijcai.2020\/151"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3657607","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3657607","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:45:37Z","timestamp":1755765937000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3657607"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":42,"alternative-id":["10.1145\/3652583.3657607","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3657607","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}