{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,30]],"date-time":"2025-12-30T08:48:53Z","timestamp":1767084533910,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No. 2020AAA0106300"],"award-info":[{"award-number":["No. 2020AAA0106300"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62250008, 62222209, 62102222"],"award-info":[{"award-number":["No. 62250008, 62222209, 62102222"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017582","name":"Beijing National Research Center For Information Science And Technology","doi-asserted-by":"publisher","award":["No. BNR2023RC01003, BNR2023TD03006"],"award-info":[{"award-number":["No. BNR2023RC01003, BNR2023TD03006"]}],"id":[{"id":"10.13039\/501100017582","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Key Lab of Networked Multimedia"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612468","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"3724-3735","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":26,"title":["Intra- and Inter-Modal Curriculum for Multimodal Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9582-7331","authenticated-orcid":false,"given":"Yuwei","family":"Zhou","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0351-2939","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0943-2286","authenticated-orcid":false,"given":"Hong","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9108-9618","authenticated-orcid":false,"given":"Xuguang","family":"Duan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2236-9290","authenticated-orcid":false,"given":"Wenwu","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00522"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_2_3_1","volume-title":"Large scale distributed neural network training through online distillation. arXiv preprint arXiv:1804.03235","author":"Anil Rohan","year":"2018","unstructured":"Rohan Anil, Gabriel Pereyra, Alexandre Passos, Robert Ormandi, George E Dahl, and Geoffrey E Hinton. 2018. Large scale distributed neural network training through online distillation. arXiv preprint arXiv:1804.03235 (2018)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_5_1","volume-title":"Manuel Montes-y G\u00f3mez, and Fabio A Gonz\u00e1lez","author":"Arevalo John","year":"2017","unstructured":"John Arevalo, Thamar Solorio, Manuel Montes-y G\u00f3mez, and Fabio A Gonz\u00e1lez. 2017. Gated multimodal units for information fusion. arXiv preprint arXiv:1702.01992 (2017)."},{"key":"e_1_3_2_2_6_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Tadas Baltruvs","year":"2018","unstructured":"Tadas Baltruvs aitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 2 (2018), 423--443."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_2_9_1","volume-title":"A large annotated corpus for learning natural language inference. arXiv preprint arXiv:1508.05326","author":"Bowman Samuel R","year":"2015","unstructured":"Samuel R Bowman, Gabor Angeli, Christopher Potts, and Christopher D Manning. 2015. A large annotated corpus for learning natural language inference. arXiv preprint arXiv:1508.05326 (2015)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1150402.1150464"},{"key":"e_1_3_2_2_11_1","volume-title":"Crema-d: Crowd-sourced emotional multimodal actors dataset","author":"Cao Houwei","year":"2014","unstructured":"Houwei Cao, David G Cooper, Michael K Keutmann, Ruben C Gur, Ani Nenkova, and Ragini Verma. 2014. Crema-d: Crowd-sourced emotional multimodal actors dataset. IEEE transactions on affective computing, Vol. 5, 4 (2014), 377--390."},{"key":"e_1_3_2_2_12_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Castells Thibault","year":"2020","unstructured":"Thibault Castells, Philippe Weinzaepfel, and Jerome Revaud. 2020. SuperLoss: A Generic Loss for Robust Curriculum Learning. Advances in Neural Information Processing Systems, Vol. 33 (2020)."},{"key":"e_1_3_2_2_13_1","volume-title":"Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:1906.01815","author":"Castro Santiago","year":"2019","unstructured":"Santiago Castro, Devamanyu Hazarika, Ver\u00f3nica P\u00e9rez-Rosas, Roger Zimmermann, Rada Mihalcea, and Soujanya Poria. 2019. Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:1906.01815 (2019)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00348"},{"key":"e_1_3_2_2_16_1","volume-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126651"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.crma.2012.03.014"},{"key":"e_1_3_2_2_19_1","volume-title":"Improving multi-modal learning with uni-modal teachers. arXiv preprint arXiv:2106.11059","author":"Du Chenzhuang","year":"2021","unstructured":"Chenzhuang Du, Tingle Li, Yichen Liu, Zixin Wen, Tianyu Hua, Yue Wang, and Hang Zhao. 2021. Improving multi-modal learning with uni-modal teachers. arXiv preprint arXiv:2106.11059 (2021)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-22199-0"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10809"},{"key":"e_1_3_2_2_22_1","volume-title":"Steepest descent methods for multicriteria optimization. Mathematical methods of operations research","author":"Fliege J\u00f6rg","year":"2000","unstructured":"J\u00f6rg Fliege and Benar Fux Svaiter. 2000. Steepest descent methods for multicriteria optimization. Mathematical methods of operations research, Vol. 51 (2000), 479--494."},{"key":"e_1_3_2_2_23_1","volume-title":"International Conference on Machine Learning. PMLR, 1607--1616","author":"Furlanello Tommaso","year":"2018","unstructured":"Tommaso Furlanello, Zachary Lipton, Michael Tschannen, Laurent Itti, and Anima Anandkumar. 2018. Born again neural networks. In International Conference on Machine Learning. PMLR, 1607--1616."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01453-z"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_2_26_1","volume-title":"international conference on machine learning. PMLR, 1311--1320","author":"Graves Alex","year":"2017","unstructured":"Alex Graves, Marc G Bellemare, Jacob Menick, Remi Munos, and Koray Kavukcuoglu. 2017. Automated curriculum learning for neural networks. In international conference on machine learning. PMLR, 1311--1320."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2916887"},{"key":"e_1_3_2_2_28_1","volume-title":"International Conference on Machine Learning. PMLR, 2535--2544","author":"Hacohen Guy","year":"2019","unstructured":"Guy Hacohen and Daphna Weinshall. 2019. On the power of curriculum learning in training deep networks. In International Conference on Machine Learning. PMLR, 2535--2544."},{"key":"e_1_3_2_2_29_1","volume-title":"Louis-Philippe Morency, et al.","author":"Hasan Md Kamrul","year":"2019","unstructured":"Md Kamrul Hasan, Wasifur Rahman, Amir Zadeh, Jianyuan Zhong, Md Iftekhar Tanveer, Louis-Philippe Morency, et al. 2019. UR-FUNNY: A multimodal language dataset for understanding humor. arXiv preprint arXiv:1904.06618 (2019)."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_31_1","volume-title":"International conference on machine learning. PMLR, 1492--1501","author":"Hern\u00e1ndez-Lobato Daniel","year":"2016","unstructured":"Daniel Hern\u00e1ndez-Lobato, Jose Hernandez-Lobato, Amar Shah, and Ryan Adams. 2016. Predictive entropy search for multi-objective bayesian optimization. In International conference on machine learning. PMLR, 1492--1501."},{"key":"e_1_3_2_2_32_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_2_33_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_2_34_1","volume-title":"International Conference on Machine Learning. PMLR, 9938--9964","author":"Javaloy Adri\u00e1n","year":"2022","unstructured":"Adri\u00e1n Javaloy, Maryam Meghdadi, and Isabel Valera. 2022. Mitigating Modality Collapse in Multimodal VAEs via Impartial Optimization. In International Conference on Machine Learning. PMLR, 9938--9964."},{"key":"e_1_3_2_2_35_1","volume-title":"Leo Anthony Celi, and Roger G Mark","author":"Johnson Alistair EW","year":"2016","unstructured":"Alistair EW Johnson, Tom J Pollard, Lu Shen, Li-wei H Lehman, Mengling Feng, Mohammad Ghassemi, Benjamin Moody, Peter Szolovits, Leo Anthony Celi, and Roger G Mark. 2016. MIMIC-III, a freely accessible critical care database. Scientific data, Vol. 3, 1 (2016), 1--9."},{"key":"e_1_3_2_2_36_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_2_37_1","first-page":"2611","article-title":"The hateful memes challenge: Detecting hate speech in multimodal memes","volume":"33","author":"Kiela Douwe","year":"2020","unstructured":"Douwe Kiela, Hamed Firooz, Aravind Mohan, Vedanuj Goswami, Amanpreet Singh, Pratik Ringshia, and Davide Testuggine. 2020. The hateful memes challenge: Detecting hate speech in multimodal memes. Advances in Neural Information Processing Systems, Vol. 33 (2020), 2611--2624.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_38_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_39_1","first-page":"2","article-title":"Self-Paced Learning for Latent Variable Models","volume":"1","author":"Kumar M Pawan","year":"2010","unstructured":"M Pawan Kumar, Benjamin Packer, and Daphne Koller. 2010. Self-Paced Learning for Latent Variable Models. In NIPS, Vol. 1. 2.","journal-title":"NIPS"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561847"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341579"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2019.2959445"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3406324.3410710"},{"key":"e_1_3_2_2_44_1","volume-title":"Pareto-path multitask multiple kernel learning","author":"Li Cong","year":"2014","unstructured":"Cong Li, Michael Georgiopoulos, and Georgios C Anagnostopoulos. 2014. Pareto-path multitask multiple kernel learning. IEEE transactions on neural networks and learning systems, Vol. 26, 1 (2014), 51--61."},{"key":"e_1_3_2_2_45_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_2_46_1","volume-title":"Multibench: Multiscale benchmarks for multimodal representation learning. arXiv preprint arXiv:2107.07502","author":"Liang Paul Pu","year":"2021","unstructured":"Paul Pu Liang, Yiwei Lyu, Xiang Fan, Zetian Wu, Yun Cheng, Jason Wu, Leslie Chen, Peter Wu, Michelle A Lee, Yuke Zhu, et al. 2021. Multibench: Multiscale benchmarks for multimodal representation learning. arXiv preprint arXiv:2107.07502 (2021)."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_48_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"volume-title":"Nonlinear multiobjective optimization","author":"Miettinen Kaisa","key":"e_1_3_2_2_49_1","unstructured":"Kaisa Miettinen. 1999. Nonlinear multiobjective optimization. Vol. 12. Springer Science & Business Media."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10589-017-9921-x"},{"key":"e_1_3_2_2_52_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_53_1","volume-title":"Deep multimodal learning: A survey on recent advances and trends","author":"Ramachandram Dhanesh","year":"2017","unstructured":"Dhanesh Ramachandram and Graham W Taylor. 2017. Deep multimodal learning: A survey on recent advances and trends. IEEE signal processing magazine, Vol. 34, 6 (2017), 96--108."},{"key":"e_1_3_2_2_54_1","volume-title":"International conference on machine learning. PMLR, 4334--4343","author":"Ren Mengye","year":"2018","unstructured":"Mengye Ren, Wenyuan Zeng, Bin Yang, and Raquel Urtasun. 2018. Learning to reweight examples for robust deep learning. In International conference on machine learning. PMLR, 4334--4343."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3987"},{"key":"e_1_3_2_2_56_1","volume-title":"Antoine Chassang, Carlo Gatta, and Yoshua Bengio.","author":"Romero Adriana","year":"2014","unstructured":"Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. 2014. Fitnets: Hints for thin deep nets. arXiv preprint arXiv:1412.6550 (2014)."},{"key":"e_1_3_2_2_57_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_2_58_1","volume-title":"Multimodal deep learning for short-term stock volatility prediction. arXiv preprint arXiv:1812.10479","author":"Sardelich Marcelo","year":"2018","unstructured":"Marcelo Sardelich and Suresh Manandhar. 2018. Multimodal deep learning for short-term stock volatility prediction. arXiv preprint arXiv:1812.10479 (2018)."},{"key":"e_1_3_2_2_59_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Saxena Shreyas","year":"2019","unstructured":"Shreyas Saxena, Oncel Tuzel, and Dennis DeCoste. 2019. Data parameters: A new family of parameters for learning a differentiable curriculum. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1015472306888"},{"key":"e_1_3_2_2_61_1","volume-title":"Multi-task learning as multi-objective optimization. Advances in neural information processing systems","author":"Sener Ozan","year":"2018","unstructured":"Ozan Sener and Vladlen Koltun. 2018. Multi-task learning as multi-objective optimization. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_2_62_1","volume-title":"8th ICML Workshop on Automated Machine Learning (AutoML).","author":"Shi Xingjian","year":"2021","unstructured":"Xingjian Shi, Jonas Mueller, Nick Erickson, Mu Li, and Alex Smola. 2021. Multimodal automl on structured tables with text fields. In 8th ICML Workshop on Automated Machine Learning (AutoML)."},{"key":"e_1_3_2_2_63_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Sinha Samarth","year":"2020","unstructured":"Samarth Sinha, Animesh Garg, and Hugo Larochelle. 2020. Curriculum By Smoothing. Advances in Neural Information Processing Systems, Vol. 33 (2020)."},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01611-x"},{"key":"e_1_3_2_2_65_1","volume-title":"Human Language Technologies: The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics. 751--759","author":"Spitkovsky Valentin I","year":"2010","unstructured":"Valentin I Spitkovsky, Hiyan Alshawi, and Dan Jurafsky. 2010. From baby steps to leapfrog: How ?less is more\" in unsupervised dependency parsing. In Human Language Technologies: The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics. 751--759."},{"key":"e_1_3_2_2_66_1","volume-title":"A corpus for reasoning about natural language grounded in photographs. arXiv preprint arXiv:1811.00491","author":"Suhr Alane","year":"2018","unstructured":"Alane Suhr, Stephanie Zhou, Ally Zhang, Iris Zhang, Huajun Bai, and Yoav Artzi. 2018. A corpus for reasoning about natural language grounded in photographs. arXiv preprint arXiv:1811.00491 (2018)."},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3101421"},{"key":"e_1_3_2_2_68_1","volume-title":"International conference on machine learning. PMLR, 10347--10357","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2021. Training data-efficient image transformers & distillation through attention. In International conference on machine learning. PMLR, 10347--10357."},{"key":"e_1_3_2_2_69_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV) Workshops. 0-0.","author":"Vielzeuf Valentin","year":"2018","unstructured":"Valentin Vielzeuf, Alexis Lechervy, St\u00e9phane Pateux, and Fr\u00e9d\u00e9ric Jurie. 2018. Centralnet: a multilayer approach for multimodal fusion. In Proceedings of the European Conference on Computer Vision (ECCV) Workshops. 0-0."},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3069908"},{"key":"e_1_3_2_2_72_1","volume-title":"International Conference on Machine Learning. PMLR, 5238--5246","author":"Weinshall Daphna","year":"2018","unstructured":"Daphna Weinshall, Gad Cohen, and Dan Amir. 2018. Curriculum learning by transfer learning: Theory and experiments with deep networks. In International Conference on Machine Learning. PMLR, 5238--5246."},{"key":"e_1_3_2_2_73_1","volume-title":"International Conference on Machine Learning. PMLR, 24043--24055","author":"Wu Nan","year":"2022","unstructured":"Nan Wu, Stanislaw Jastrzebski, Kyunghyun Cho, and Krzysztof J Geras. 2022. Characterizing and overcoming the greedy nature of learning in multi-modal deep neural networks. In International Conference on Machine Learning. PMLR, 24043--24055."},{"key":"e_1_3_2_2_74_1","volume-title":"Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706","author":"Xie Ning","year":"2019","unstructured":"Ning Xie, Farley Lai, Derek Doran, and Asim Kadav. 2019. Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706 (2019)."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.754"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_2_77_1","volume-title":"Proceedings, Part II 14","author":"Yu Licheng","year":"2016","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C Berg, and Tamara L Berg. 2016. Modeling context in referring expressions. In Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part II 14. Springer, 69--85."},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/35.41402"},{"key":"e_1_3_2_2_79_1","volume-title":"Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:1606.06259","author":"Zadeh Amir","year":"2016","unstructured":"Amir Zadeh, Rowan Zellers, Eli Pincus, and Louis-Philippe Morency. 2016. Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:1606.06259 (2016)."},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_2_2_81_1","volume-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv preprint arXiv:1612.03928","author":"Zagoruyko Sergey","year":"2016","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2016. Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv preprint arXiv:1612.03928 (2016)."},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00454"},{"key":"e_1_3_2_2_83_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6513"},{"key":"e_1_3_2_2_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2969791"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612468","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612468","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:05:35Z","timestamp":1755821135000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612468"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":84,"alternative-id":["10.1145\/3581783.3612468","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612468","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}