{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T05:26:37Z","timestamp":1781587597199,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,9,13]],"date-time":"2022-09-13T00:00:00Z","timestamp":1663027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,9,18]]},"DOI":"10.1145\/3523227.3546788","type":"proceedings-article","created":{"date-parts":[[2022,9,13]],"date-time":"2022-09-13T14:13:46Z","timestamp":1663078426000},"page":"92-101","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":72,"title":["Denoising Self-Attentive Sequential Recommendation"],"prefix":"10.1145","author":[{"given":"Huiyuan","family":"Chen","sequence":"first","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yusan","family":"Lin","sequence":"additional","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Menghai","family":"Pan","sequence":"additional","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lan","family":"Wang","sequence":"additional","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chin-Chia Michael","family":"Yeh","sequence":"additional","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoting","family":"Li","sequence":"additional","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yan","family":"Zheng","sequence":"additional","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fei","family":"Wang","sequence":"additional","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hao","family":"Yang","sequence":"additional","affiliation":[{"name":"Visa Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2022,9,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Jasmijn Bastings Wilker Aziz and Ivan Titov. 2019. Interpretable Neural Predictions with Differentiable Binary Variables. In ACL. 2963\u20132977.","DOI":"10.18653\/v1\/P19-1284"},{"key":"e_1_3_2_1_2_1","volume-title":"Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150(2020).","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew\u00a0E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150(2020)."},{"key":"e_1_3_2_1_3_1","unstructured":"Yoshua Bengio Nicholas L\u00e9onard and Aaron Courville. 2013. Estimating or propagating gradients through stochastic neurons for conditional computation. arXiv preprint arXiv:1308.3432(2013)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Jianxin Chang Chen Gao Yu Zheng Yiqun Hui Yanan Niu Yang Song Depeng Jin and Yong Li. 2021. Sequential Recommendation with Graph Neural Networks. In SIGIR. 378\u2013387.","DOI":"10.1145\/3404835.3462968"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Huiyuan Chen Yusan Lin Fei Wang and Hao Yang. 2021. Tops bottoms and shoes: building capsule wardrobes via cross-attention tensor network. In RecSys. 453\u2013462.","DOI":"10.1145\/3460231.3474258"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Huiyuan Chen Lan Wang Yusan Lin Chin-Chia\u00a0Michael Yeh Fei Wang and Hao Yang. 2021. Structured graph convolutional networks with stochastic masks for recommender systems. In SIGIR. 614\u2013623.","DOI":"10.1145\/3404835.3462868"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Huiyuan Chen Chin-Chia\u00a0Michael Yeh Fei Wang and Hao Yang. 2022. Graph Neural Transport Networks with Non-local Attentions for Recommender Systems. In WWW. 1955\u20131964.","DOI":"10.1145\/3485447.3512162"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Qiwei Chen Huan Zhao Wei Li Pipei Huang and Wenwu Ou. 2019. Behavior sequence transformer for e-commerce recommendation in alibaba. In DLP-KDD.","DOI":"10.1145\/3326937.3341261"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Xu Chen Hongteng Xu Yongfeng Zhang Jiaxi Tang Yixin Cao Zheng Qin and Hongyuan Zha. 2018. Sequential recommendation with user memory networks. In WSDM. 108\u2013116.","DOI":"10.1145\/3159652.3159668"},{"key":"e_1_3_2_1_10_1","unstructured":"Rewon Child Scott Gray Alec Radford and Ilya Sutskever. 2019. Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509(2019)."},{"key":"e_1_3_2_1_11_1","volume-title":"An Analysis of BERT\u2019s Attention. In ACL Workshop BlackboxNLP. 276\u2013286","author":"Clark Kevin","year":"2019","unstructured":"Kevin Clark, Urvashi Khandelwal, Omer Levy, and Christopher\u00a0D Manning. 2019. What Does BERT Look at? An Analysis of BERT\u2019s Attention. In ACL Workshop BlackboxNLP. 276\u2013286."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Gon\u00e7alo\u00a0M Correia Vlad Niculae and Andr\u00e9\u00a0FT Martins. 2019. Adaptively Sparse Transformers. In EMNLP. 2174\u20132184.","DOI":"10.18653\/v1\/D19-1223"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Nicola De\u00a0Cao Michael\u00a0Sejr Schlichtkrull Wilker Aziz and Ivan Titov. 2020. How Do Decisions Emerge across Layers in Neural Models? Interpretation with Differentiable Masking. In EMNLP. 3243\u20133255.","DOI":"10.18653\/v1\/2020.emnlp-main.262"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Gabriel de Souza Pereira\u00a0Moreira Sara Rabhi Jeong\u00a0Min Lee Ronay Ak and Even Oldridge. 2021. Transformers4Rec: Bridging the Gap between NLP and Sequential\/Session-Based Recommendation. In RecSys.","DOI":"10.1145\/3460231.3474255"},{"key":"e_1_3_2_1_15_1","volume-title":"ARMS: Antithetic-REINFORCE-Multi-Sample Gradient for Binary Variables. In ICML. 2717\u20132727.","author":"Dimitriev Aleksandar","year":"2021","unstructured":"Aleksandar Dimitriev and Mingyuan Zhou. 2021. ARMS: Antithetic-REINFORCE-Multi-Sample Gradient for Binary Variables. In ICML. 2717\u20132727."},{"key":"e_1_3_2_1_16_1","unstructured":"Zhe Dong Andriy Mnih and George Tucker. 2020. DisARM: An Antithetic Gradient Estimator for Binary Latent Variables. In NeurIPS."},{"key":"e_1_3_2_1_17_1","unstructured":"Angela Fan Edouard Grave and Armand Joulin. 2020. Reducing Transformer Depth on Demand with Structured Dropout. In ICLR."},{"key":"e_1_3_2_1_18_1","unstructured":"Qipeng Guo Xipeng Qiu Pengfei Liu Yunfan Shao Xiangyang Xue and Zheng Zhang. 2019. Star-Transformer. In NAACL-HLT. 1315\u20131325."},{"key":"e_1_3_2_1_19_1","unstructured":"Ruining He Wang-Cheng Kang and Julian McAuley. 2017. Translation-based recommendation. In RecSys. 161\u2013169."},{"key":"e_1_3_2_1_20_1","unstructured":"Bal\u00e1zs Hidasi Alexandros Karatzoglou Linas Baltrunas and Domonkos Tikk. 2016. Session-based Recommendations with Recurrent Neural Networks. In ICLR."},{"key":"e_1_3_2_1_21_1","unstructured":"Judy Hoffman Daniel\u00a0A Roberts and Sho Yaida. 2019. Robust learning with jacobian regularization. arXiv preprint arXiv:1908.02729(2019)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Jin Huang Wayne\u00a0Xin Zhao Hongjian Dou Ji-Rong Wen and Edward\u00a0Y Chang. 2018. Improving sequential recommendation with knowledge-enhanced memory networks. In SIGIR. 505\u2013514.","DOI":"10.1145\/3209978.3210017"},{"key":"e_1_3_2_1_23_1","volume-title":"A stochastic estimator of the trace of the influence matrix for Laplacian smoothing splines. Communications in Statistics-Simulation and Computation","author":"Hutchinson F","year":"1989","unstructured":"Michael\u00a0F Hutchinson. 1989. A stochastic estimator of the trace of the influence matrix for Laplacian smoothing splines. Communications in Statistics-Simulation and Computation (1989)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Daniel Jakubovitz and Raja Giryes. 2018. Improving dnn robustness to adversarial attacks using jacobian regularization. In ECCV. 514\u2013529.","DOI":"10.1007\/978-3-030-01258-8_32"},{"key":"e_1_3_2_1_25_1","unstructured":"Eric Jang Shixiang Gu and Ben Poole. 2017. Categorical Reparameterization with Gumbel-Softmax. In ICLR."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Wang-Cheng Kang and Julian McAuley. 2018. Self-attentive sequential recommendation. In ICDM. 197\u2013206.","DOI":"10.1109\/ICDM.2018.00035"},{"key":"e_1_3_2_1_27_1","unstructured":"Angelos Katharopoulos Apoorv Vyas Nikolaos Pappas and Fran\u00e7ois Fleuret. 2020. Transformers are rnns: Fast autoregressive transformers with linear attention. In ICML. 5156\u20135165."},{"key":"e_1_3_2_1_28_1","unstructured":"Hyunjik Kim George Papamakarios and Andriy Mnih. 2021. The lipschitz constant of self-attention. In ICML. 5562\u20135571."},{"key":"e_1_3_2_1_29_1","volume-title":"Reformer: The Efficient Transformer. In ICLR.","author":"Kitaev Nikita","year":"2019","unstructured":"Nikita Kitaev, Lukasz Kaiser, and Anselm Levskaya. 2019. Reformer: The Efficient Transformer. In ICLR."},{"key":"e_1_3_2_1_30_1","unstructured":"Jiacheng Li Yujie Wang and Julian McAuley. 2020. Time interval aware self-attention for sequential recommendation. In WSDM. 322\u2013330."},{"key":"e_1_3_2_1_31_1","unstructured":"Yang Li Tong Chen Peng-Fei Zhang and Hongzhi Yin. 2021. Lightweight Self-Attentive Sequential Recommendation. In CIKM. 967\u2013977."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Defu Lian Yongji Wu Yong Ge Xing Xie and Enhong Chen. 2020. Geography-aware sequential location recommendation. In KDD. 2009\u20132019.","DOI":"10.1145\/3394486.3403252"},{"key":"e_1_3_2_1_33_1","unstructured":"Zhiwei Liu Ziwei Fan Yu Wang and Philip\u00a0S. Yu. 2021. Augmenting Sequential Recommendation with Pseudo-Prior Items via Reversely Pre-Training Transformer. In SIGIR. 1608\u20131612."},{"key":"e_1_3_2_1_34_1","unstructured":"Christos Louizos Max Welling and Diederik\u00a0P Kingma. 2019. Learning Sparse Neural Networks through Regularization. In ICLR."},{"key":"e_1_3_2_1_35_1","unstructured":"Jianxin Ma Chang Zhou Hongxia Yang Peng Cui Xin Wang and Wenwu Zhu. 2020. Disentangled self-supervision in sequential recommenders. In KDD. 483\u2013491."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Chaitanya Malaviya Pedro Ferreira and Andr\u00e9\u00a0FT Martins. 2018. Sparse and Constrained Attention for Neural Machine Translation. In ACL. 370\u2013376.","DOI":"10.18653\/v1\/P18-2059"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Raphael\u00a0A Meyer Cameron Musco Christopher Musco and David\u00a0P Woodruff. 2021. Hutch++: Optimal stochastic trace estimation. In SOSA. 142\u2013155.","DOI":"10.1137\/1.9781611976496.16"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Ben Peters Vlad Niculae and Andr\u00e9\u00a0FT Martins. 2019. Sparse Sequence-to-Sequence Models. In ACL. 1504\u20131519.","DOI":"10.18653\/v1\/P19-1146"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Steffen Rendle Christoph Freudenthaler and Lars Schmidt-Thieme. 2010. Factorizing personalized markov chains for next-basket recommendation. In WWW. 811\u2013820.","DOI":"10.1145\/1772690.1772773"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Sainbayar Sukhbaatar \u00c9douard Grave Piotr Bojanowski and Armand Joulin. 2019. Adaptive Attention Span in Transformers. In ACL. 331\u2013335.","DOI":"10.18653\/v1\/P19-1032"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Fei Sun Jun Liu Jian Wu Changhua Pei Xiao Lin Wenwu Ou and Peng Jiang. 2019. BERT4Rec: Sequential recommendation with bidirectional encoder representations from transformer. In CIKM. 1441\u20131450.","DOI":"10.1145\/3357384.3357895"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Jiaxi Tang and Ke Wang. 2018. Personalized top-n sequential recommendation via convolutional sequence embedding. In WSDM. 565\u2013573.","DOI":"10.1145\/3159652.3159656"},{"key":"e_1_3_2_1_43_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS. 5998\u20136008."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Jianling Wang Kaize Ding and James Caverlee. 2021. Sequential Recommendation for Cold-start Users with Meta Transitional Learning. In SIGIR. 1783\u20131787.","DOI":"10.1145\/3404835.3463089"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Wenjie Wang Fuli Feng Xiangnan He Liqiang Nie and Tat-Seng Chua. 2021. Denoising implicit feedback for recommendation. In WSDM. 373\u2013381.","DOI":"10.1145\/3437963.3441800"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Wenjie Wang Fuli Feng Xiangnan He Hanwang Zhang and Tat-Seng Chua. 2021. Clicks can be cheating: Counterfactual recommendation for mitigating clickbait issue. In SIGIR. 1288\u20131297.","DOI":"10.1145\/3404835.3462962"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Zhenlei Wang Jingsen Zhang Hongteng Xu Xu Chen Yongfeng Zhang Wayne\u00a0Xin Zhao and Ji-Rong Wen. 2021. Counterfactual data-augmented sequential recommendation. In SIGIR. 347\u2013356.","DOI":"10.1145\/3404835.3462855"},{"key":"e_1_3_2_1_48_1","volume":"199","author":"Ronald\u00a0","unstructured":"Ronald\u00a0J Williams. 1992. Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning 8, 3-4 (1992), 229\u2013256.","journal-title":"J Williams."},{"key":"e_1_3_2_1_49_1","unstructured":"Jibang Wu Renqin Cai and Hongning Wang. 2020. D\u00e9j\u00e0 vu: A contextualized temporal attention mechanism for sequential recommendation. In WWW. 2199\u20132209."},{"key":"e_1_3_2_1_50_1","unstructured":"Liwei Wu Shuqing Li Cho-Jui Hsieh and James Sharpnack. 2020. SSE-PT: Sequential recommendation via personalized transformer. In RecSys. 328\u2013337."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Shu Wu Yuyuan Tang Yanqiao Zhu Liang Wang Xing Xie and Tieniu Tan. 2019. Session-based recommendation with graph neural networks. In AAAI. 346\u2013353.","DOI":"10.1609\/aaai.v33i01.3301346"},{"key":"e_1_3_2_1_52_1","unstructured":"Zhen Wu Lijun Wu Qi Meng Yingce Xia Shufang Xie Tao Qin Xinyu Dai and Tie-Yan Liu. 2021. UniDrop: A Simple yet Effective Technique to Improve Transformer without Extra Cost. In NAACL-HLT. 3865\u20133878."},{"key":"e_1_3_2_1_53_1","unstructured":"Chengfeng Xu Pengpeng Zhao Yanchi Liu Jiajie Xu Victor S\u00a0Sheng S.\u00a0Sheng Zhiming Cui Xiaofang Zhou and Hui Xiong. 2019. Recurrent convolutional neural network for sequential recommendation. In WWW. 3398\u20133404."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"An Yan Shuo Cheng Wang-Cheng Kang Mengting Wan and Julian McAuley. 2019. CosRec: 2D convolutional neural networks for sequential recommendation. In CIKM. 2173\u20132176.","DOI":"10.1145\/3357384.3358113"},{"key":"e_1_3_2_1_55_1","unstructured":"Chin-Chia\u00a0Michael Yeh Mengting Gu Yan Zheng Huiyuan Chen Javid Ebrahimi Zhongfang Zhuang Junpeng Wang Liang Wang and Wei Zhang. 2022. Embedding Compression with Hashing for Efficient Representation Learning in Graph. In KDD."},{"key":"e_1_3_2_1_56_1","volume-title":"ARM: Augment-REINFORCE-merge gradient for stochastic binary networks. In ICLR.","author":"Yin Mingzhang","year":"2019","unstructured":"Mingzhang Yin and Mingyuan Zhou. 2019. ARM: Augment-REINFORCE-merge gradient for stochastic binary networks. In ICLR."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Fajie Yuan Alexandros Karatzoglou Ioannis Arapakis Joemon\u00a0M Jose and Xiangnan He. 2019. A simple convolutional generative network for next item recommendation. In WSDM. 582\u2013590.","DOI":"10.1145\/3289600.3290975"},{"key":"e_1_3_2_1_58_1","unstructured":"Manzil Zaheer Guru Guruganesh Kumar\u00a0Avinava Dubey Joshua Ainslie Chris Alberti Santiago Ontanon Philip Pham Anirudh Ravula Qifan Wang Li Yang 2020. Big bird: Transformers for longer sequences. In NeurIPS. 17283\u201317297."},{"key":"e_1_3_2_1_59_1","volume-title":"Informer: Beyond efficient transformer for long sequence time-series forecasting. In AAAI. 11106\u201311115.","author":"Zhou Haoyi","year":"2021","unstructured":"Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. 2021. Informer: Beyond efficient transformer for long sequence time-series forecasting. In AAAI. 11106\u201311115."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"Wangchunshu Zhou Tao Ge Furu Wei Ming Zhou and Ke Xu. 2020. Scheduled DropHead: A Regularization Method for Transformer Models. In EMNLP. 1971\u20131980.","DOI":"10.18653\/v1\/2020.findings-emnlp.178"}],"event":{"name":"RecSys '22: Sixteenth ACM Conference on Recommender Systems","location":"Seattle WA USA","acronym":"RecSys '22","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGAI ACM Special Interest Group on Artificial Intelligence","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGIR ACM Special Interest Group on Information Retrieval","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 16th ACM Conference on Recommender Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3523227.3546788","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3523227.3546788","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:03:00Z","timestamp":1750186980000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3523227.3546788"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,13]]},"references-count":60,"alternative-id":["10.1145\/3523227.3546788","10.1145\/3523227"],"URL":"https:\/\/doi.org\/10.1145\/3523227.3546788","relation":{},"subject":[],"published":{"date-parts":[[2022,9,13]]},"assertion":[{"value":"2022-09-13","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}