{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:12:50Z","timestamp":1778080370600,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3771045","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:16Z","timestamp":1765008376000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Gradient Shaping Beyond Clipping: A Functional Perspective on Update Magnitude Control"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9178-2912","authenticated-orcid":false,"given":"Haochen","family":"You","sequence":"first","affiliation":[{"name":"Graduate School of Arts and Sciences, Columbia University, New York City, New York, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1444-7267","authenticated-orcid":false,"given":"Baojing","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Hebei Institute of Communications, Shijiazhuang, Hebei Province, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Allouah Youssef","year":"2025","unstructured":"Youssef Allouah, Rachid Guerraoui, Nirupam Gupta, Ahmed Jellouli, Geovani Rizk, and John Stephan. 2025. Adaptive gradient clipping for robust federated learning. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_3_1_3_2","unstructured":"Xiangyi Chen Steven\u00a0Z Wu and Mingyi Hong. 2020. Understanding gradient clipping in private sgd: A geometric perspective. Advances in Neural Information Processing Systems 33 (2020) 13773\u201313782."},{"key":"e_1_3_3_1_4_2","first-page":"794","volume-title":"International conference on machine learning","author":"Chen Zhao","year":"2018","unstructured":"Zhao Chen, Vijay Badrinarayanan, Chen-Yu Lee, and Andrew Rabinovich. 2018. Gradnorm: Gradient normalization for adaptive loss balancing in deep multitask networks. In International conference on machine learning. PMLR, 794\u2013803."},{"key":"e_1_3_3_1_5_2","unstructured":"Eduard Gorbunov Marina Danilova and Alexander Gasnikov. 2020. Stochastic optimization with heavy-tailed noise via accelerated gradient clipping. Advances in Neural Information Processing Systems 33 (2020) 15042\u201315053."},{"key":"e_1_3_3_1_6_2","first-page":"3964","volume-title":"International Conference on Machine Learning","author":"Gurbuzbalaban Mert","year":"2021","unstructured":"Mert Gurbuzbalaban, Umut Simsekli, and Lingjiong Zhu. 2021. The heavy-tail phenomenon in SGD. In International Conference on Machine Learning. PMLR, 3964\u20133975."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3723366.3723369"},{"key":"e_1_3_3_1_8_2","unstructured":"Tianjin Huang Haotian Hu Zhenyu Zhang Gaojie Jin Xiang Li Li Shen Tianlong Chen Lu Liu Qingsong Wen Zhangyang Wang et\u00a0al. 2025. Stable-SPAM: How to Train in 4-Bit More Stably than 16-Bit Adam. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.17055 (2025)."},{"key":"e_1_3_3_1_9_2","unstructured":"Florian H\u00fcbler Ilyas Fatkhullin and Niao He. 2024. From gradient clipping to normalization for heavy tailed sgd. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.13849 (2024)."},{"key":"e_1_3_3_1_10_2","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_1_11_2","first-page":"17343","volume-title":"International Conference on Machine Learning","author":"Koloskova Anastasia","year":"2023","unstructured":"Anastasia Koloskova, Hadrien Hendrikx, and Sebastian\u00a0U Stich. 2023. Revisiting gradient clipping: Stochastic bias and tight convergence guarantees. In International Conference on Machine Learning. PMLR, 17343\u201317363."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Atli Kosson Bettina Messmer and Martin Jaggi. 2024. Analyzing & reducing the need for learning rate warmup in GPT training. Advances in Neural Information Processing Systems 37 (2024) 2914\u20132942.","DOI":"10.52202\/079017-0096"},{"key":"e_1_3_3_1_13_2","unstructured":"Alex Krizhevsky Geoffrey Hinton et\u00a0al. 2009. Learning multiple layers of features from tiny images. (2009)."},{"key":"e_1_3_3_1_14_2","unstructured":"Abhay Kumar Louis Owen Nilabhra\u00a0Roy Chowdhury and Fabian G\u00fcra. 2025. ZClip: Adaptive Spike Mitigation for LLM Pre-Training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.02507 (2025)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671728"},{"key":"e_1_3_3_1_16_2","unstructured":"Qiang Li Michal Yemini and Hoi-To Wai. 2024. Clipped SGD Algorithms for Performative Prediction: Tight Bounds for Clipping Bias and Remedies. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.10995 (2024)."},{"key":"e_1_3_3_1_17_2","unstructured":"Yuqi Li Kai Li Xin Yin Zhifei Yang Junhao Dong Zeyu Dong Chuanguang Yang Yingli Tian and Yao Lu. 2025. Sepprune: Structured pruning for efficient deep speech separation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.12079 (2025)."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733294"},{"key":"e_1_3_3_1_19_2","unstructured":"Yuqi Li Yao Lu Zeyu Dong Chuanguang Yang Yihao Chen and Jianping Gou. 2024. Sglp: A similarity guided fast layer partition pruning for compressing large deep models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.14720 (2024)."},{"key":"e_1_3_3_1_20_2","unstructured":"Yuqi Li Chuanguang Yang Hansheng Zeng Zeyu Dong Zhulin An Yongjun Xu Yingli Tian and Hao Wu. 2025. Frequency-aligned knowledge distillation for lightweight spatiotemporal forecasting. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.02939 (2025)."},{"key":"e_1_3_3_1_21_2","first-page":"7325","volume-title":"International Conference on Machine Learning","author":"Mai Vien\u00a0V","year":"2021","unstructured":"Vien\u00a0V Mai and Mikael Johansson. 2021. Stability and convergence of stochastic gradient clipping: Beyond lipschitz continuity and smoothness. In International Conference on Machine Learning. PMLR, 7325\u20137335."},{"key":"e_1_3_3_1_22_2","unstructured":"Stephen Merity Caiming Xiong James Bradbury and Richard Socher. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.07843 (2016)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Herbert Robbins and Sutton Monro. 1951. A stochastic approximation method. The annals of mathematical statistics (1951) 400\u2013407.","DOI":"10.1214\/aoms\/1177729586"},{"key":"e_1_3_3_1_24_2","unstructured":"Fabian Schaipp Guillaume Garrigos Umut Simsekli and Robert Gower. 2024. SGD with Clipping is Secretly Estimating the Median Gradient. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.12828 (2024)."},{"key":"e_1_3_3_1_25_2","unstructured":"Egor Shulgin and Peter Richt\u00e1rik. 2024. On the Convergence of DP-SGD with Adaptive Clipping. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.19916 (2024)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1170"},{"key":"e_1_3_3_1_27_2","unstructured":"Matteo Tucat Anirbit Mukherjee Procheta Sen Mingfei Sun and Omar Rivasplata. 2024. Regularized Gradient Clipping Provably Trains Wide and Deep Neural Networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.08624 (2024)."},{"key":"e_1_3_3_1_28_2","unstructured":"Guoxia Wang Shuai Li Congliang Chen Jinle Zeng Jiabin Yang Tao Sun Yanjun Ma Dianhai Yu and Li Shen. 2025. AdaGC: Improving Training Stability for Large Language Model Pretraining. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.11034 (2025)."},{"key":"e_1_3_3_1_29_2","unstructured":"Chengkun Wei Weixian Li Gong Chen and Wenzhi Chen. 2025. DC-SGD: Differentially Private SGD with Dynamic Clipping through Gradient Norm Distribution Estimation. IEEE Transactions on Information Forensics and Security (2025)."},{"key":"e_1_3_3_1_30_2","first-page":"59","volume-title":"International Conference on Neural Information Processing","author":"You Haochen","year":"2024","unstructured":"Haochen You and Baojing Liu. 2024. Application of pseudometric functions in clustering and a novel similarity measure based on path information discrepancy. In International Conference on Neural Information Processing. Springer, 59\u201373."},{"key":"e_1_3_3_1_31_2","unstructured":"Haochen You and Baojing Liu. 2025. Mover: Multimodal optimal transport with volume-based embedding regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.12149 (2025)."},{"key":"e_1_3_3_1_32_2","unstructured":"Haochen You Baojing Liu and Hongyang He. 2025. Modular MeanFlow: Towards Stable and Scalable One-Step Generative Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.17426 (2025)."},{"key":"e_1_3_3_1_33_2","unstructured":"Jingzhao Zhang Tianxing He Suvrit Sra and Ali Jadbabaie. 2019. Why gradient clipping accelerates training: A theoretical justification for adaptivity. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1905.11881 (2019)."},{"key":"e_1_3_3_1_34_2","first-page":"26982","volume-title":"International conference on machine learning","author":"Zhao Yang","year":"2022","unstructured":"Yang Zhao, Hao Zhang, and Xiuyuan Hu. 2022. Penalizing gradient norm for efficiently improving generalization in deep learning. In International conference on machine learning. PMLR, 26982\u201326992."},{"key":"e_1_3_3_1_35_2","unstructured":"Rong Zhu. 2016. Gradient-based sampling: An adaptive importance sampling for least-squares. Advances in neural information processing systems 29 (2016)."}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3771045","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:11:13Z","timestamp":1765008673000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3771045"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":34,"alternative-id":["10.1145\/3743093.3771045","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3771045","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}