{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,18]],"date-time":"2025-01-18T05:07:43Z","timestamp":1737176863936,"version":"3.33.0"},"reference-count":75,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,15]]},"DOI":"10.1109\/bigdata62323.2024.10825379","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:23Z","timestamp":1737052283000},"page":"1029-1038","source":"Crossref","is-referenced-by-count":0,"title":["Fast Second-order Method for Neural Networks under Small Treewidth Setting"],"prefix":"10.1109","author":[{"given":"Xiaoyu","family":"Li","sequence":"first","affiliation":[{"name":"Stevens Institute of Technology,Department of Computer Science,Hoboken,NJ"}]},{"given":"Jiangxuan","family":"Long","sequence":"additional","affiliation":[{"name":"South China University of Technology,School of Software Engineering,Guangzhou,China"}]},{"given":"Zhao","family":"Song","sequence":"additional","affiliation":[{"name":"University of California, Berkeley,The Simons Institute for the Theory of Computing,Berkeley,CA"}]},{"given":"Tianyi","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Southern California,Department of Computer Science,Los Angeles,CA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3446776"},{"key":"ref2","first-page":"1225","article-title":"Train faster, generalize better: Stability of stochastic gradient descent","volume-title":"International conference on machine learning","author":"Hardt"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1137\/15M1021106"},{"issue":"7","key":"ref4","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi","year":"2011","journal-title":"Journal of machine learning research"},{"article-title":"Adam: A method for stochastic optimization","year":"2014","author":"Kingma","key":"ref5"},{"article-title":"Gradient descent provably optimizes over-parameterized neural networks","year":"2019","author":"Du","key":"ref6"},{"article-title":"Quadratic suffices for over-parametrization via matrix chernoff bound","year":"2019","author":"Song","key":"ref7"},{"article-title":"Global convergence of adaptive gradient methods for an over-parameterized neural network","year":"2019","author":"Wu","key":"ref8"},{"article-title":"Gram-gauss-newton method: Learning overparameterized neural networks for regression problems","year":"2019","author":"Cai","key":"ref9"},{"key":"ref10","article-title":"Fast convergence of natural gradient descent for over-parameterized neural networks","volume":"32","author":"Zhang","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref11","article-title":"Training (overparametrized) neural networks in near-linear time","author":"Brand","year":"2021","journal-title":"ITCS"},{"key":"ref12","article-title":"Learning overparameterized neural networks via stochastic gradient descent on structured data","volume":"31","author":"Li","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref13","first-page":"242","article-title":"A convergence theory for deep learning via over-parameterization","volume-title":"International Conference on Machine Learning","author":"Allen-Zhu"},{"key":"ref14","article-title":"On the convergence rate of training recurrent neural networks","volume":"32","author":"Allen-Zhu","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1561\/2200000050"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-020-01516-y"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1006\/jagm.1995.1009"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898718881"},{"article-title":"A faster small treewidth sdp solver","year":"2022","author":"Gu","key":"ref19"},{"key":"ref20","first-page":"735","article-title":"Deep learning via hessian-free optimization","volume-title":"Proceedings of the 27th International Conference on International Conference on Machine Learning","author":"Martens"},{"key":"ref21","first-page":"1261","article-title":"Krylov subspace descent for deep learning","volume-title":"Artificial intelligence and statistics.","author":"Vinyals","year":"2012"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1162\/089976698300017746"},{"key":"ref23","first-page":"2408","article-title":"Optimizing neural networks with kronecker-factored approximate curvature","volume-title":"International conference on machine learning","author":"Martens"},{"key":"ref24","first-page":"573","article-title":"A kronecker-factored approximate fisher matrix for convolution layers","volume-title":"International Conference on Machine Learning","author":"Grosse"},{"article-title":"Distributed second-order optimization using kronecker-factored approximations","volume-title":"International conference on learning representations","author":"Ba","key":"ref25"},{"article-title":"A unified scheme of resnet and softmax","year":"2023","author":"Song","key":"ref26"},{"article-title":"A fast optimization view: Reformulating single layer attention in llm based on tensor and svm trick, and solving it in matrix multiplication time","year":"2023","author":"Gao","key":"ref27"},{"article-title":"Federated empirical risk minimization via second-order method","year":"2023","author":"Bian","key":"ref28"},{"article-title":"Local convergence of approximate newton method for two layer nonlinear regression","year":"2023","author":"Li","key":"ref29"},{"key":"ref30","article-title":"Wide neural networks of any depth evolve as linear models under gradient descent","volume":"32","author":"Lee","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref31","first-page":"10 775","article-title":"Generalized leverage score sampling for neural networks","volume":"33","author":"Lee","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/JSAIT.2020.2991332"},{"key":"ref33","first-page":"4423","article-title":"Fl-ntk: A neural tangent kernel-based framework for federated learning analysis","volume-title":"International Conference on Machine Learning","author":"Huang"},{"key":"ref34","first-page":"22 890","article-title":"Does preprocessing help training over-parameterized neural networks?","volume":"34","author":"Song","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Looped relu mlps may be all you need as practical programmable computers","year":"2024","author":"Liang","key":"ref35"},{"article-title":"Advancing the understanding of fixed point iterations in deep neural networks: A detailed analytical study","year":"2024","author":"Ke","key":"ref36"},{"article-title":"Outlier-efficient hopfield layers for large transformer-based models","volume-title":"Forty-first International Conference on Machine Learning (ICML)","author":"Hu","key":"ref37"},{"article-title":"Nonparametric modern hopfield models","year":"2024","author":"Hu","key":"ref38"},{"article-title":"Decoupled alignment for robust plug-and-play adaptation","year":"2024","author":"Luo","key":"ref39"},{"article-title":"Enhancing jailbreak attack against large language models through silent tokens","year":"2024","author":"Yu","key":"ref40"},{"article-title":"Computational limits of low-rank adaptation (lora) for transformer-based models","year":"2024","author":"Hu","key":"ref41"},{"article-title":"Differentially private kernel density estimation","year":"2024","author":"Liu","key":"ref42"},{"article-title":"On statistical rates and provably efficient criteria of latent diffusion transformers (dits)","volume-title":"Thirty-eighth Conference on Neural Information Processing Systems (NeurIPS)","author":"Hu","key":"ref43"},{"article-title":"Provably optimal memory capacity for modern hopfield models: Transformer-compatible dense associative memories as spherical codes","volume-title":"Thirty-eighth Conference on Neural Information Processing Systems (NeurIPS)","author":"Hu","key":"ref44"},{"key":"ref45","article-title":"Neural tangent kernel: Convergence and generalization in neural networks","volume":"31","author":"Jacot","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref46","first-page":"9835","article-title":"Oblivious sketching-based central path method for linear programming","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Song"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1561\/0400000060"},{"article-title":"Quantum speedups for approximating the john ellipsoid","year":"2024","author":"Li","key":"ref48"},{"article-title":"Fast john ellipsoid computation with differential privacy optimization","year":"2024","author":"Li","key":"ref49"},{"key":"ref50","first-page":"32 418","article-title":"Sketching meets differential privacy: fast algorithm for dynamic kronecker projection maintenance","volume-title":"International Conference on Machine Learning","author":"Song"},{"article-title":"Training overparametrized neural networks in sublinear time","year":"2022","author":"Deng","key":"ref51"},{"article-title":"A sublinear adversarial training algorithm","volume-title":"The Twelfth International Conference on Learning Representations","author":"Gao","key":"ref52"},{"key":"ref53","first-page":"19932","article-title":"Federated adversarial learning: A framework with convergence analysis","volume-title":"International Conference on Machine Learning","author":"Li"},{"article-title":"Differential privacy mechanisms in neural tangent kernel regression","year":"2024","author":"Liang","key":"ref54"},{"article-title":"Differential privacy of cross-attention with provable guarantee","year":"2024","author":"Liang","key":"ref55"},{"article-title":"Hsr-enhanced sparse attention acceleration","year":"2024","author":"Chen","key":"ref56"},{"article-title":"Beyond linear approximations: A novel pruning approach for attention matrix","year":"2024","author":"Liang","key":"ref57"},{"article-title":"Differentially private attention computation","volume-title":"Neurips Safe Generative AI Workshop 2024","author":"Gao","key":"ref58"},{"article-title":"Multi-layer transformers gradient can be approximated in almost linear time","year":"2024","author":"Liang","key":"ref59"},{"article-title":"Fine-grained attention i\/o complexity: Comprehensive analysis for backward passes","year":"2024","author":"Li","key":"ref60"},{"article-title":"A tighter complexity analysis of sparsegpt","year":"2024","author":"Li","key":"ref61"},{"article-title":"Bypassing the exponential dependency: Looped transformers efficiently learn in-context by multistep gradient descent","year":"2024","author":"Chen","key":"ref62"},{"key":"ref63","article-title":"Graph neural tangent kernel: Fusing graph neural networks with graph kernels","volume":"32","author":"Du","year":"2019","journal-title":"Advances in neural information processing systems"},{"article-title":"Is solving graph neural tangent kernel equivalent to training graph neural network?","year":"2023","author":"Qin","key":"ref64"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679675"},{"key":"ref66","first-page":"17827","article-title":"Graph neural tangent kernel: Convergence on large graphs","volume-title":"International Conference on Machine Learning","author":"Krishnagopal"},{"article-title":"Tensor attention training: Provably efficient learning of higher-order transformers","year":"2024","author":"Liang","key":"ref67"},{"article-title":"Exploring the frontiers of softmax: Provable optimization, applications in diffusion model, and beyond","year":"2024","author":"Li","key":"ref68"},{"article-title":"Unraveling the smoothness properties of diffusion models: A gaussian mixture perspective","year":"2024","author":"Liang","key":"ref69"},{"article-title":"Unraveling the smoothness properties of diffusion models: A gaussian mixture perspective","year":"2024","author":"Liang","key":"ref70"},{"article-title":"Towards few-shot adaptation of foundation models via multitask finetuning","volume-title":"The Twelfth International Conference on Learning Representations","author":"Xu","key":"ref71"},{"key":"ref72","first-page":"179","article-title":"Domain generalization via nuclear norm regularization","volume-title":"Conference on Parsimony and Learning","author":"Shi"},{"key":"ref73","article-title":"Computer solution of sparse linear systems","author":"George","year":"1994","journal-title":"Oak Ridge National Laboratory"},{"article-title":"A nearly-linear time algorithm for structured support vector machines","year":"2023","author":"Gu","key":"ref74"},{"key":"ref75","article-title":"Learning and generalization in overparameterized neural networks, going beyond two layers","volume":"32","author":"Allen-Zhu","year":"2019","journal-title":"Advances in neural information processing systems"}],"event":{"name":"2024 IEEE International Conference on Big Data (BigData)","start":{"date-parts":[[2024,12,15]]},"location":"Washington, DC, USA","end":{"date-parts":[[2024,12,18]]}},"container-title":["2024 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10824975\/10824942\/10825379.pdf?arnumber=10825379","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T08:18:38Z","timestamp":1737101918000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10825379\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,15]]},"references-count":75,"URL":"https:\/\/doi.org\/10.1109\/bigdata62323.2024.10825379","relation":{},"subject":[],"published":{"date-parts":[[2024,12,15]]}}}