{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:08:31Z","timestamp":1755839311543,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,4,25]],"date-time":"2022-04-25T00:00:00Z","timestamp":1650844800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,4,25]]},"DOI":"10.1145\/3485447.3512021","type":"proceedings-article","created":{"date-parts":[[2022,4,25]],"date-time":"2022-04-25T05:13:07Z","timestamp":1650863587000},"page":"3009-3019","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Can Small Heads Help? Understanding and Improving Multi-Task Generalization"],"prefix":"10.1145","author":[{"given":"Yuyan","family":"Wang","sequence":"first","affiliation":[{"name":"Google Research, USA"}]},{"given":"Zhe","family":"Zhao","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]},{"given":"Bo","family":"Dai","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]},{"given":"Christopher","family":"Fifty","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]},{"given":"Dong","family":"Lin","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]},{"given":"Lichan","family":"Hong","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]},{"given":"Li","family":"Wei","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]},{"given":"Ed H.","family":"Chi","sequence":"additional","affiliation":[{"name":"Google Research, USA"}]}],"member":"320","published-online":{"date-parts":[[2022,4,25]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Rohan Anil Gabriel Pereyra Alexandre Passos Robert Ormandi George\u00a0E Dahl and Geoffrey\u00a0E Hinton. 2018. Large scale distributed neural network training through online distillation. arXiv preprint arXiv:1804.03235(2018)."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the 34th International Conference on Machine Learning-Volume 70","author":"Arik Sercan\u00a0\u00d6","year":"2017","unstructured":"Sercan\u00a0\u00d6 Arik, Mike Chrzanowski, Adam Coates, Gregory Diamos, Andrew Gibiansky, Yongguo Kang, Xian Li, John Miller, Andrew Ng, Jonathan Raiman, 2017. Deep voice: Real-time neural text-to-speech. In Proceedings of the 34th International Conference on Machine Learning-Volume 70. JMLR. org, 195\u2013204."},{"key":"e_1_3_2_1_3_1","volume-title":"A note on weighted criteria methods for compromise solutions in multi-objective optimization. Engineering optimization 27, 2","author":"Athan Timothy\u00a0Ward","year":"1996","unstructured":"Timothy\u00a0Ward Athan and Panos\u00a0Y Papalambros. 1996. A note on weighted criteria methods for compromise solutions in multi-objective optimization. Engineering optimization 27, 2 (1996), 155\u2013176."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959180"},{"volume-title":"Learning to learn","author":"Baxter Jonathan","key":"e_1_3_2_1_5_1","unstructured":"Jonathan Baxter. 1998. Theoretical models of learning to learn. In Learning to learn. Springer, 71\u201394."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.731"},{"key":"e_1_3_2_1_7_1","volume-title":"Multitask learning. Machine learning 28, 1","author":"Caruana Rich","year":"1997","unstructured":"Rich Caruana. 1997. Multitask learning. Machine learning 28, 1 (1997), 41\u201375."},{"key":"e_1_3_2_1_8_1","unstructured":"Rich Caruana and Virginia\u00a0R De\u00a0Sa. 1997. Promoting poor features to supervisors: Some inputs work better as outputs. In Advances in Neural Information Processing Systems. 389\u2013395."},{"key":"e_1_3_2_1_9_1","volume-title":"Gradnorm: Gradient normalization for adaptive loss balancing in deep multitask networks. arXiv preprint arXiv:1711.02257(2017).","author":"Chen Zhao","year":"2017","unstructured":"Zhao Chen, Vijay Badrinarayanan, Chen-Yu Lee, and Andrew Rabinovich. 2017. Gradnorm: Gradient normalization for adaptive loss balancing in deep multitask networks. arXiv preprint arXiv:1711.02257(2017)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Zhe Chen Yuyan Wang Dong Lin Derek\u00a0Zhiyuan Cheng Lichan Hong Ed\u00a0H Chi and Claire Cui. 2020. Beyond Point Estimate: Inferring Ensemble Prediction Variation from Neuron Activation Strength in Recommender Systems. arXiv preprint arXiv:2008.07032(2020).","DOI":"10.1145\/3437963.3441770"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390177"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.2606"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959190"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.crma.2012.03.014"},{"key":"e_1_3_2_1_15_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018).","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018)."},{"key":"e_1_3_2_1_16_1","volume-title":"You Only Train Once: Loss-Conditional Training of Deep Networks. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HyxY6JHKwr","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy and Josip Djolonga. 2020. You Only Train Once: Loss-Conditional Training of Deep Networks. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HyxY6JHKwr"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5555\/3305381.3305498"},{"key":"e_1_3_2_1_18_1","unstructured":"Yaroslav Ganin and Victor Lempitsky. 2014. Unsupervised domain adaptation by backpropagation. arXiv preprint arXiv:1409.7495(2014)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.apm.2015.03.022"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"F\u00a0Maxwell Harper and Joseph\u00a0A Konstan. 2015. The movielens datasets: History and context. Acm transactions on interactive intelligent systems (tiis) 5 4(2015) 1\u201319.","DOI":"10.1145\/2827872"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Kazuma Hashimoto Caiming Xiong Yoshimasa Tsuruoka and Richard Socher. 2016. A joint many-task model: Growing a neural network for multiple nlp tasks. arXiv preprint arXiv:1611.01587(2016).","DOI":"10.18653\/v1\/D17-1206"},{"key":"e_1_3_2_1_23_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531(2015)."},{"volume-title":"Practical goal programming. Vol.\u00a0141","author":"Jones Dylan","key":"e_1_3_2_1_24_1","unstructured":"Dylan Jones, Mehrdad Tamiz, 2010. Practical goal programming. Vol.\u00a0141. Springer."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 7482\u20137491","author":"Kendall Alex","year":"2018","unstructured":"Alex Kendall, Yarin Gal, and Roberto Cipolla. 2018. Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. In Proceedings of the IEEE conference on computer vision and pattern recognition. 7482\u20137491."},{"volume-title":"Algorithms for optimization","author":"Kochenderfer J","key":"e_1_3_2_1_26_1","unstructured":"Mykel\u00a0J Kochenderfer and Tim\u00a0A Wheeler. 2019. Algorithms for optimization. Mit Press."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"e_1_3_2_1_28_1","unstructured":"Xi Lin Hui-Ling Zhen Zhenhua Li Qing-Fu Zhang and Sam Kwong. 2019. Pareto Multi-Task Learning. In Advances in Neural Information Processing Systems. 12037\u201312047."},{"key":"e_1_3_2_1_29_1","unstructured":"Xiaodong Liu Jianfeng Gao Xiaodong He Li Deng Kevin Duh and Ye-Yi Wang. 2015. Representation learning using multi-task deep neural networks for semantic classification and information retrieval. (2015)."},{"key":"e_1_3_2_1_30_1","unstructured":"Xiaodong Liu Pengcheng He Weizhu Chen and Jianfeng Gao. 2019. Improving multi-task deep neural networks via knowledge distillation for natural language understanding. arXiv preprint arXiv:1904.09482(2019)."},{"key":"e_1_3_2_1_31_1","unstructured":"Xiaodong Liu Pengcheng He Weizhu Chen and Jianfeng Gao. 2019. Multi-task deep neural networks for natural language understanding. arXiv preprint arXiv:1901.11504(2019)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.126"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301216"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220007"},{"key":"e_1_3_2_1_35_1","unstructured":"Bryan McCann Nitish\u00a0Shirish Keskar Caiming Xiong and Richard Socher. 2018. The natural language decathlon: Multitask learning as question answering. arXiv preprint arXiv:1806.08730(2018)."},{"key":"e_1_3_2_1_36_1","volume-title":"Pseudo-task Augmentation: From Deep Multitask Learning to Intratask Sharing\u2014and Back. arXiv preprint arXiv:1803.04062(2018).","author":"Meyerson Elliot","year":"2018","unstructured":"Elliot Meyerson and Risto Miikkulainen. 2018. Pseudo-task Augmentation: From Deep Multitask Learning to Intratask Sharing\u2014and Back. arXiv preprint arXiv:1803.04062(2018)."},{"volume-title":"Nonlinear multiobjective optimization. Vol.\u00a012","author":"Miettinen Kaisa","key":"e_1_3_2_1_37_1","unstructured":"Kaisa Miettinen. 2012. Nonlinear multiobjective optimization. Vol.\u00a012. Springer Science & Business Media."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.433"},{"volume-title":"Non-convex multi-objective optimization","author":"Pardalos M","key":"e_1_3_2_1_39_1","unstructured":"Panos\u00a0M Pardalos, Antanas \u017dilinskas, and Julius \u017dilinskas. 2017. Non-convex multi-objective optimization. Springer."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Marek Rei. 2017. Semi-supervised multitask learning for sequence labeling. arXiv preprint arXiv:1704.07156(2017).","DOI":"10.18653\/v1\/P17-1194"},{"key":"e_1_3_2_1_41_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems. 91\u201399."},{"key":"e_1_3_2_1_42_1","unstructured":"Sebastian Ruder. 2017. An overview of multi-task learning in deep neural networks. arXiv preprint arXiv:1706.05098(2017)."},{"volume-title":"Theory of multiobjective optimization","author":"Sawaragi Yoshikazu","key":"e_1_3_2_1_43_1","unstructured":"Yoshikazu Sawaragi, Hirotaka Nakayama, and Tetesuzo Tanino. 1985. Theory of multiobjective optimization. Elsevier."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the first international conference on genetic algorithms and their applications","author":"Schaffer J\u00a0David","year":"1985","unstructured":"J\u00a0David Schaffer. 1985. Multiple objective optimization with vector evaluated genetic algorithms. In Proceedings of the first international conference on genetic algorithms and their applications, 1985. Lawrence Erlbaum Associates. Inc., Publishers."},{"key":"e_1_3_2_1_45_1","unstructured":"Ozan Sener and Vladlen Koltun. 2018. Multi-task learning as multi-objective optimization. In Advances in Neural Information Processing Systems. 527\u2013538."},{"key":"e_1_3_2_1_46_1","unstructured":"Trevor Standley Amir\u00a0R Zamir Dawn Chen Leonidas Guibas Jitendra Malik and Silvio Savarese. 2019. Which Tasks Should Be Learned Together in Multi-task Learning?arXiv preprint arXiv:1905.07553(2019)."},{"key":"e_1_3_2_1_47_1","unstructured":"Simon Vandenhende Stamatios Georgoulis Bert De\u00a0Brabandere and Luc Van\u00a0Gool. 2019. Branched multi-task networks: deciding what layers to share. arXiv preprint arXiv:1904.02920(2019)."},{"key":"e_1_3_2_1_48_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998\u20136008."},{"key":"e_1_3_2_1_49_1","unstructured":"Sen Wu Hongyang\u00a0R Zhang and Christopher R\u00e9. 2020. Understanding and Improving Information Transfer in Multi-Task Learning. arXiv preprint arXiv:2005.00944(2020)."},{"key":"e_1_3_2_1_50_1","unstructured":"Han Xiao Kashif Rasul and Roland Vollgraf. 2017. Fashion-mnist: a novel image dataset for benchmarking machine learning algorithms. arXiv preprint arXiv:1708.07747(2017)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF00932614"},{"key":"e_1_3_2_1_52_1","unstructured":"Tianhe Yu Saurabh Kumar Abhishek Gupta Sergey Levine Karol Hausman and Chelsea Finn. 2020. Gradient surgery for multi-task learning. arXiv preprint arXiv:2001.06782(2020)."},{"key":"e_1_3_2_1_53_1","unstructured":"Chiyuan Zhang Samy Bengio Moritz Hardt Benjamin Recht and Oriol Vinyals. 2016. Understanding deep learning requires rethinking generalization. arXiv preprint arXiv:1611.03530(2016)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_7"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3298689.3346997"}],"event":{"name":"WWW '22: The ACM Web Conference 2022","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Virtual Event, Lyon France","acronym":"WWW '22"},"container-title":["Proceedings of the ACM Web Conference 2022"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3485447.3512021","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3485447.3512021","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:30:06Z","timestamp":1750188606000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3485447.3512021"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,4,25]]},"references-count":55,"alternative-id":["10.1145\/3485447.3512021","10.1145\/3485447"],"URL":"https:\/\/doi.org\/10.1145\/3485447.3512021","relation":{},"subject":[],"published":{"date-parts":[[2022,4,25]]},"assertion":[{"value":"2022-04-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}