{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T09:59:28Z","timestamp":1775815168363,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,9,14]],"date-time":"2023-09-14T00:00:00Z","timestamp":1694649600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,9,14]]},"DOI":"10.1145\/3604915.3608831","type":"proceedings-article","created":{"date-parts":[[2023,9,14]],"date-time":"2023-09-14T22:40:23Z","timestamp":1694731223000},"page":"791-797","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Enhancing Transformers without Self-supervised Learning: A Loss Landscape Perspective in Sequential Recommendation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3165-6136","authenticated-orcid":false,"given":"Vivian","family":"Lai","sequence":"first","affiliation":[{"name":"Visa Research, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6360-558X","authenticated-orcid":false,"given":"Huiyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Research, Visa Research, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9807-2963","authenticated-orcid":false,"given":"Chin-Chia Michael","family":"Yeh","sequence":"additional","affiliation":[{"name":"Visa Research, Visa Research, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7746-2468","authenticated-orcid":false,"given":"Minghua","family":"Xu","sequence":"additional","affiliation":[{"name":"Visa Research, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3532-4323","authenticated-orcid":false,"given":"Yiwei","family":"Cai","sequence":"additional","affiliation":[{"name":"Visa Research, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3020-9828","authenticated-orcid":false,"given":"Hao","family":"Yang","sequence":"additional","affiliation":[{"name":"Visa Research, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,9,14]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"International Conference on Machine Learning.","author":"Andriushchenko Maksym","year":"2022","unstructured":"Maksym Andriushchenko and Nicolas Flammarion. 2022. Towards understanding sharpness-aware minimization. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3546788"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460231.3474258"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3512162"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531763"},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Learning Representations.","author":"Chen Xiangning","year":"2022","unstructured":"Xiangning Chen, Cho-Jui Hsieh, and Boqing Gong. 2022. When Vision Transformers Outperform ResNets without Pre-training or Strong Data Augmentations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3512090"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460231.3474255"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 4171\u20134186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 4171\u20134186."},{"key":"e_1_3_2_1_10_1","volume-title":"Sharpness-aware Minimization for Efficiently Improving Generalization. In International Conference on Learning Representations.","author":"Foret Pierre","year":"2021","unstructured":"Pierre Foret, Ariel Kleiner, Hossein Mobahi, and Behnam Neyshabur. 2021. Sharpness-aware Minimization for Efficiently Improving Generalization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_11_1","volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_12_1","volume-title":"Session-based Recommendations with Recurrent Neural Networks. In International Conference on Learning Representations.","author":"Hidasi Bal\u00e1zs","year":"2016","unstructured":"Bal\u00e1zs Hidasi, Alexandros Karatzoglou, Linas Baltrunas, and Domonkos Tikk. 2016. Session-based Recommendations with Recurrent Neural Networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Learning Representations.","author":"Jiang Yiding","year":"2020","unstructured":"Yiding Jiang, Behnam Neyshabur, Hossein Mobahi, Dilip Krishnan, and Samy Bengio. 2020. Fantastic Generalization Measures and Where to Find Them. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2018.00035"},{"key":"e_1_3_2_1_15_1","volume-title":"On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima. In International Conference on Learning Representations.","author":"Keskar Nitish\u00a0Shirish","year":"2017","unstructured":"Nitish\u00a0Shirish Keskar, Dheevatsa Mudigere, Jorge Nocedal, Mikhail Smelyanskiy, and Ping Tak\u00a0Peter Tang. 2017. On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. 11148\u201311161","author":"Kim Minyoung","year":"2022","unstructured":"Minyoung Kim, Da Li, Shell\u00a0X Hu, and Timothy Hospedales. 2022. Fisher sam: Information geometry and sharpness aware minimisation. In International Conference on Machine Learning. 11148\u201311161."},{"key":"e_1_3_2_1_17_1","volume-title":"Matrix factorization techniques for recommender systems. Computer","author":"Koren Yehuda","year":"2009","unstructured":"Yehuda Koren, Robert Bell, and Chris Volinsky. 2009. Matrix factorization techniques for recommender systems. Computer (2009), 30\u201337."},{"key":"e_1_3_2_1_18_1","volume-title":"International Conference on Machine Learning. 5905\u20135914","author":"Kwon Jungmin","year":"2021","unstructured":"Jungmin Kwon, Jeongseop Kim, Hyunseo Park, and In\u00a0Kwon Choi. 2021. Asam: Adaptive sharpness-aware minimization for scale-invariant learning of deep neural networks. In International Conference on Machine Learning. 5905\u20135914."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501999"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610206"},{"key":"e_1_3_2_1_21_1","unstructured":"Hao Li Zheng Xu Gavin Taylor Christoph Studer and Tom Goldstein. 2018. Visualizing the loss landscape of neural nets. Advances in neural information processing systems."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3336191.3371786"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403252"},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Machine Learning. 6094\u20136104","author":"Lin Tao","year":"2020","unstructured":"Tao Lin, Lingjing Kong, Sebastian Stich, and Martin Jaggi. 2020. Extrapolation for large-batch training in deep learning. In International Conference on Machine Learning. 6094\u20136104."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01204"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463036"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403091"},{"key":"e_1_3_2_1_28_1","volume-title":"Revisiting small batch training for deep neural networks. arXiv preprint arXiv:1804.07612","author":"Masters Dominic","year":"2018","unstructured":"Dominic Masters and Carlo Luschi. 2018. Revisiting small batch training for deep neural networks. arXiv preprint arXiv:1804.07612 (2018)."},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Learning Representations.","author":"M\u00f6llenhoff Thomas","year":"2023","unstructured":"Thomas M\u00f6llenhoff and Mohammad\u00a0Emtiyaz Khan. 2023. SAM as an Optimal Relaxation of Bayes. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_30_1","volume-title":"Conference on Learning Theory. 3526\u20133545","author":"Neu Gergely","year":"2021","unstructured":"Gergely Neu, Gintare\u00a0Karolina Dziugaite, Mahdi Haghifam, and Daniel\u00a0M Roy. 2021. Information-theoretic generalization bounds for stochastic gradient descent. In Conference on Learning Theory. 3526\u20133545."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498433"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. 452\u2013461","author":"Rendle Steffen","year":"2009","unstructured":"Steffen Rendle, Christoph Freudenthaler, Zeno Gantner, and Lars Schmidt-Thieme. 2009. BPR: Bayesian personalized ranking from implicit feedback. In Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. 452\u2013461."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/1772690.1772773"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357895"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3159652.3159656"},{"key":"e_1_3_2_1_36_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998\u20136008."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599347"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539404"},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Learning Representations.","author":"Wen Kaiyue","year":"2023","unstructured":"Kaiyue Wen, Tengyu Ma, and Zhiyuan Li. 2023. How Sharpness-Aware Minimization Minimizes Sharpness?. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383313.3412258"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE53745.2022.00099"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539068"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3411954"}],"event":{"name":"RecSys '23: Seventeenth ACM Conference on Recommender Systems","location":"Singapore Singapore","acronym":"RecSys '23","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGAI ACM Special Interest Group on Artificial Intelligence","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGIR ACM Special Interest Group on Information Retrieval","SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGecom Special Interest Group on Economics and Computation"]},"container-title":["Proceedings of the 17th ACM Conference on Recommender Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3604915.3608831","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3604915.3608831","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:07Z","timestamp":1750178767000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3604915.3608831"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,14]]},"references-count":43,"alternative-id":["10.1145\/3604915.3608831","10.1145\/3604915"],"URL":"https:\/\/doi.org\/10.1145\/3604915.3608831","relation":{},"subject":[],"published":{"date-parts":[[2023,9,14]]},"assertion":[{"value":"2023-09-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}