{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T17:02:16Z","timestamp":1778605336728,"version":"3.51.4"},"reference-count":171,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Onera Health and the Project"},{"DOI":"10.13039\/501100008530","name":"European Regional Development Fund","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100008530","id-type":"DOI","asserted-by":"publisher"}]},{"name":"ORTEC"},{"DOI":"10.13039\/501100022491","name":"Max Planck ETH Center for Learning Systems","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100022491","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Sustainable Chemical Processes through Catalysis"},{"name":"National Center of Competence in Research"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,2,1]]},"DOI":"10.1109\/tpami.2022.3157042","type":"journal-article","created":{"date-parts":[[2022,3,7]],"date-time":"2022-03-07T20:52:47Z","timestamp":1646686367000},"page":"1353-1371","source":"Crossref","is-referenced-by-count":64,"title":["A Review of the Gumbel-max Trick and its Extensions for Discrete Stochasticity in Machine Learning"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2629-3898","authenticated-orcid":false,"given":"Iris A. M.","family":"Huijben","sequence":"first","affiliation":[{"name":"Department of Electrical Engineering, Eindhoven University of Technology, Eindhoven, The Netherlands"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1837-1454","authenticated-orcid":false,"given":"Wouter","family":"Kool","sequence":"additional","affiliation":[{"name":"Amsterdam Machine Learning Lab (AMLab), University of Amsterdam, Amsterdam, The Netherlands"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Max B.","family":"Paulus","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH, Z&#x00FC;rich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2845-0495","authenticated-orcid":false,"given":"Ruud J. G.","family":"van Sloun","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering, Eindhoven University of Technology, Eindhoven, The Netherlands"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Auto-encoding variational bayes","author":"Kingma","year":"2014","journal-title":"Proc. Int. Conf. Learn. Representations"},{"key":"ref2","first-page":"1747","article-title":"Pixel recurrent neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Oord"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/57.1.97"},{"key":"ref4","volume-title":"Statistical Theory of Extreme Values and Some Practical Applications: A Series of Lectures","volume":"33","author":"Gumbel","year":"1954"},{"key":"ref5","article-title":"Categorical reparameterization with Gumbel-softmax","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Jang"},{"key":"ref6","article-title":"The concrete distribution: A continuous relaxation of discrete random variables","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Maddison"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"issue":"2","key":"ref8","first-page":"115","article-title":"Les valeurs extr\u00eames des distributions statistiques","volume":"5","author":"Gumbel","year":"1935","journal-title":"Ann. L\u2019institut Henri Poincar\u00e9"},{"key":"ref9","first-page":"141","article-title":"La distribution de la plus grande de n valeurs","volume":"1","author":"Mises","year":"1936","journal-title":"Rev. Math. Union Interbalcanique"},{"key":"ref10","first-page":"5094","article-title":"Boltzmann exploration done right","volume-title":"Proc. 24th Int. Conf. Neural Informat. Process. Syst.","author":"Cesa-Bianchi"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389768"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0241271"},{"key":"ref13","first-page":"3499","article-title":"Stochastic beams and where to find them: The Gumbel-Top-k trick for sampling sequences without replacement","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kool"},{"key":"ref14","first-page":"1","article-title":"Ancestral gumbel-top-k sampling for sampling without replacement","volume-title":"J. Mach. Learn. Res.","volume":"21","author":"Kool","year":"2020"},{"key":"ref15","first-page":"6309","article-title":"Neural discrete representation learning","volume-title":"Proc. 31st Int. Conf. Neural Informat. Process. Syst.","author":"Van Den Oord"},{"key":"ref16","article-title":"Estimating or propagating gradients through stochastic neurons for conditional computation","author":"Bengio","year":"2013"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2018.8489778"},{"key":"ref18","first-page":"1","article-title":"Towards hierarchical discrete variational autoencoders","volume-title":"Proc. Symp. Adv. Approx. Bayes. Infer.","author":"Li\u00e9vin"},{"key":"ref19","first-page":"8821","article-title":"Zero-shot text-to-image generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ramesh"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/339"},{"key":"ref21","first-page":"1","article-title":"Is simple better?: Revisiting simple generative models for unsupervised clustering","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Figueroa"},{"key":"ref22","first-page":"708","article-title":"Learning disentangled joint continuous and discrete representations","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Dupont"},{"key":"ref23","first-page":"1","article-title":"Semi-supervised learning using deep generative models and auxiliary tasks","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Figueroa"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9207154"},{"key":"ref25","article-title":"Learning to screen for fast softmax inference on large vocabulary neural networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen"},{"key":"ref26","first-page":"4252","article-title":"Deep clustering with concrete K-means","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Gao"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12077"},{"key":"ref28","article-title":"Differentiable perturb-and-parse: Semi-supervised parsing with a structured variational autoencoder","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Corro","year":"2019"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-34518-1_10"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683480"},{"key":"ref31","article-title":"VQ-WAV2VEC: Self-supervised learning of discrete speech representations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Baevski"},{"key":"ref32","first-page":"13623","article-title":"Paraphrase generation with latent bag of words","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Fu"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3298689.3347068"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1093\/bib\/bbaa256"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.374"},{"key":"ref36","first-page":"1386","article-title":"Learning to explain: An information-theoretic perspective on model interpretation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref37","first-page":"444","article-title":"Concrete autoencoders: Differentiable feature selection and reconstruction","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Abid"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2017.11.005"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC.2019.8917213"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/s42979-020-00264-2"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220086"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00184"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023327"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-47436-2_57"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58598-3_15"},{"key":"ref46","first-page":"1","article-title":"Greedy attack and gumbel attack: Generating adversarial examples for discrete data","volume":"21","author":"Yang","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00344"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3374135.3385309"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00684"},{"key":"ref50","first-page":"2146","article-title":"Emergence of language with multi-agent games: Learning to communicate with sequences of symbols","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Havrylov"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11492"},{"key":"ref52","first-page":"2672","article-title":"Generative adversarial nets","volume-title":"Proc. 27th Int. Conf. Neural Informat. Process. Syst.","author":"Goodfellow"},{"key":"ref53","article-title":"GANS for sequences of discrete elements with the gumbel-softmax distribution","author":"Kusner","year":"2016"},{"key":"ref54","first-page":"4135","article-title":"Semi-supervised image captioning via reconstruction","volume-title":"Proc. Int. Conf. Comput. Vis.","author":"Xu"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.445"},{"key":"ref56","first-page":"313","article-title":"Best of both worlds: Transferring knowledge from discriminative learning to a generative visual dialog model","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Lu"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12016"},{"key":"ref58","first-page":"1","article-title":"Relgan: Relational generative adversarial networks for text generation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Nie"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-srw.23"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2976491"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58526-6_46"},{"key":"ref62","first-page":"622","article-title":"End-to-end differentiable adversarial imitation Learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Baram"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2942592"},{"key":"ref64","first-page":"1","article-title":"Compressing word embeddings via deep compositional code learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Shu"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/JXCDC.2020.2992306"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.364"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12259"},{"key":"ref68","first-page":"2235","article-title":"Doc2Hash: Learning discrete latent variables for document retrieval","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguist. Hum. Lang. Technol.","author":"Zhang"},{"key":"ref69","first-page":"573","article-title":"Improving inference for neural image compression","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Yang"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511794308"},{"key":"ref71","first-page":"1","article-title":"Deep probabilistic subsampling for task-adaptive compressed sensing","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Huijben"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2020.3008501"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053331"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/WCSP49889.2020.9299795"},{"key":"ref75","first-page":"10509","article-title":"Active deep probabilistic subsampling","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Gorp"},{"key":"ref76","article-title":"Relaxed quantization for discretized neural networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Louizos"},{"key":"ref77","article-title":"Mixed precision of convnets via differentiable architecture search","author":"Wu","year":"2018"},{"key":"ref78","first-page":"663","article-title":"Stochastic layer-wise precision in deep neural networks","volume-title":"Proc. Conf. Uncertainty Artif. Intell.","author":"Lacey"},{"key":"ref79","first-page":"3584","article-title":"Concrete dropout","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Gal"},{"key":"ref80","first-page":"876","article-title":"DATA: Differentiable architecture approximation","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Chang"},{"key":"ref81","article-title":"SNAS: Stochastic neural architecture search","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Xie"},{"key":"ref82","article-title":"Learning sparse neural networks through L0 regularization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Louizos"},{"key":"ref83","article-title":"Gradient-based optimization of neural network architecture","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Grathwohl"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01099"},{"key":"ref85","first-page":"1","article-title":"Network pruning via transformable architecture search","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Dong"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58583-9_15"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01298"},{"key":"ref88","first-page":"5122","article-title":"Operation-aware soft channel pruning using differentiable masks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kang"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3176809"},{"key":"ref90","first-page":"3001","article-title":"Towards binary-valued gates for robust LSTM training","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683060"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01190-4"},{"key":"ref93","article-title":"Batch-shaping for learning conditional channel gated networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Bejnordi"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6098"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00239"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00363"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00477"},{"key":"ref98","article-title":"Interpreting graph neural networks for NLP with differentiable edge masking","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Schlichtkrull"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00147"},{"key":"ref100","first-page":"3854","article-title":"Learning to branch for multi-task learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Guo"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/s41109-019-0194-4"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-1150"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11975"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-2902"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2020.102811"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1016\/0022-2496(77)90026-8"},{"key":"ref107","first-page":"3086","article-title":"A* sampling","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Maddison"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipl.2005.11.003"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1093\/oso\/9780198536932.001.0001"},{"key":"ref110","article-title":"Gumbel-max trick and weighted reservoir sampling","author":"Vieira","year":"2014"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/544"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/10761.003.0008"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-141-3.50030-4"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330733"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1082"},{"key":"ref116","article-title":"The curious case of neural text degeneration","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Holtzman"},{"key":"ref117","first-page":"8785","article-title":"Incremental sampling without replacement for sequence models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Shi"},{"key":"ref118","first-page":"10 260","article-title":"Predictive sampling with forecasting autoregressive models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wiggers"},{"key":"ref119","first-page":"2627","article-title":"REBAR: Low-variance, unbiased gradient estimates for discrete latent variable models","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Tucker"},{"key":"ref120","article-title":"Backpropagation through the void: Optimizing control variates for black-box gradient estimation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Grathwohl"},{"key":"ref121","article-title":"Rao-blackwellizing the straight-through gumbel-softmax gradient estimator","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Paulus"},{"key":"ref122","first-page":"4881","article-title":"Counterfactual off-policy evaluation with gumbel-max structural causal models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Oberst"},{"key":"ref123","article-title":"Argmax flows and multinomial diffusion: Towards non-autoregressive language models","author":"Hoogeboom","year":"2021"},{"key":"ref124","article-title":"Gumbel machinery","author":"Maddison","year":"2017"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380160"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.2307\/2346567"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1037\/14396-000"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.21236\/AD0426243"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.2307\/3212535"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1080\/03610910701790269"},{"key":"ref131","first-page":"1","article-title":"Estimating gradients for discrete random variables by sampling without replacement","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kool"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1037\/h0070288"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1080\/01621459.1956.10501326"},{"key":"ref134","first-page":"379","article-title":"Ordered and unordered estimators in sampling without replacement","volume":"18","author":"Murthy","year":"1957","journal-title":"Sankhy\u0101, Indian J. Statist."},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1145\/1314690.1314696"},{"key":"ref136","article-title":"Estimating means in a finite universe","author":"Vieira","year":"2017"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126242"},{"key":"ref138","first-page":"1221","article-title":"Randomized optimum models for structured prediction","volume-title":"Proc. Int. Conf. Artif. Intell. Statst","author":"Tarlow"},{"key":"ref139","first-page":"1","article-title":"On sampling from the Gibbs distribution with random maximum a-posteriori perturbations","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Hazan"},{"key":"ref140","first-page":"3691","article-title":"Scalable discrete sampling as a multi-armed bandit problem","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10421"},{"key":"ref142","first-page":"1278","article-title":"Stochastic backpropagation and approximate inference in deep generative models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Rezende"},{"key":"ref143","first-page":"3814","article-title":"Learning and inference via maximum inner product search","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"6","author":"Mussmann"},{"key":"ref144","article-title":"Fast amortizedinference and learning in log-linear models with randomly perturbed nearest neighbor search,","volume-title":"Conf. Uncertain. Artif. Intell.","author":"Mussmann","year":"2017"},{"key":"ref145","article-title":"Learning latent permutations with gumbel-sinkhorn networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Mena"},{"key":"ref146","first-page":"1","article-title":"Concrete MAP detection: A machine learning inspired relaxation","volume-title":"Proc. IEEE Int. ITG Workshop Smart Antennas","author":"Beck"},{"key":"ref147","first-page":"879","article-title":"Gumbel-softmax-based Optimization: A simple general framework for optimization problems on graphs","volume-title":"Proc. Int. Conf. Complex Netw. Their Appl.","author":"Li"},{"key":"ref148","first-page":"1","article-title":"Monte carlo gradient estimation in machine learning","volume":"21","author":"Mohamed","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1145\/84537.84552"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"key":"ref151","first-page":"5244","article-title":"Evaluating the variance of likelihood-ratio gradient estimators","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"7","author":"Tokui"},{"key":"ref152","first-page":"1791","article-title":"Neural variational inference and learning in belief networks","volume":"32","author":"Mnih","journal-title":"Proc. Int. Conf. Mach. Learn."},{"key":"ref153","first-page":"2188","article-title":"Variational inference for monte carlo objectives","volume":"48","author":"Mnih","journal-title":"Proc. Int. Conf. Mach. Learn."},{"key":"ref154","first-page":"1","article-title":"Buy 4 reinforce samples, get a baseline for free!","volume-title":"Proc. Deep Reinforcement Learn. Meets Structured Prediction Workshop","author":"Kool"},{"key":"ref155","first-page":"7095","article-title":"ARSM: Augment-REINFORCE-swap-merge estimator for gradient backpropagation through categorical variables","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Yin"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1214\/13-BA858"},{"key":"ref157","first-page":"5691","article-title":"Gradient estimation with stochastic softmax tricks","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Paulus"},{"key":"ref158","first-page":"12311","article-title":"Invertible gaussian reparameterization: Revisiting the gumbel-softmax","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Potapczynski"},{"key":"ref159","first-page":"1618","article-title":"Reparameterizing the birkhoff polytope for variational permutation inference","volume-title":"Proc. 25th Int. Conf. Artif. Intell. Statist.","author":"Linderman"},{"key":"ref160","article-title":"Stochastic optimization of sorting networks via continuous relaxations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Grover"},{"key":"ref161","article-title":"TensorFlow: Large-scale machine learning on heterogeneous systems","author":"Abadi","year":"2015"},{"key":"ref162","first-page":"8024","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"Paszke","year":"2019","journal-title":"Proc. Conf. Neural Informat. Process. Syst."},{"key":"ref163","first-page":"1727","article-title":"Which evaluations uncover sense representations that actually make sense?","volume-title":"Proc. Lang. Resour. Eval. Conf.","author":"Guo"},{"key":"ref164","doi-asserted-by":"crossref","DOI":"10.1016\/j.ddtec.2020.11.009","article-title":"A compact review of molecular property prediction with graph neural networks","volume-title":"Drug Discovery Today: Technologies","author":"Wieder","year":"2020"},{"key":"ref165","first-page":"179","article-title":"OptNet: Differentiable optimization as a layer in neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Amos"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059462"},{"key":"ref167","first-page":"1014","article-title":"Differentiable learning of submodular models","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Djolonga"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/379"},{"key":"ref169","first-page":"3799","article-title":"SparseMAP: Differentiable sparse structured inference","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Niculae"},{"key":"ref170","first-page":"6203","article-title":"Direct optimization through arg max for discrete variational auto-encoder","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Lorberbom"},{"key":"ref171","first-page":"9508","article-title":"Learning with differentiable perturbed optimizers","volume-title":"Proc. Conf. Neural Informat. Process. Syst.","author":"Berthet"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10008914\/09729603.pdf?arnumber=9729603","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,17]],"date-time":"2024-01-17T23:38:04Z","timestamp":1705534684000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9729603\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,1]]},"references-count":171,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2022.3157042","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,2,1]]}}}