{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T02:54:43Z","timestamp":1764557683934,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,4,30]],"date-time":"2023-04-30T00:00:00Z","timestamp":1682812800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,4,30]]},"DOI":"10.1145\/3543873.3584655","type":"proceedings-article","created":{"date-parts":[[2023,4,28]],"date-time":"2023-04-28T11:36:14Z","timestamp":1682681774000},"page":"500-507","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Visual Item Selection With Voice Assistants"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9453-9933","authenticated-orcid":false,"given":"Prashan","family":"Wanigasekara","sequence":"first","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8141-2629","authenticated-orcid":false,"given":"Rafid","family":"Al-Humaimidi","sequence":"additional","affiliation":[{"name":"Amazon, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2233-4568","authenticated-orcid":false,"given":"Turan","family":"Gojayev","sequence":"additional","affiliation":[{"name":"Amazon, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8219-507X","authenticated-orcid":false,"given":"Niloofar","family":"Gheissari","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1948-5629","authenticated-orcid":false,"given":"Achal","family":"Dave","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8716-2528","authenticated-orcid":false,"given":"Stephen","family":"Rawls","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5924-6935","authenticated-orcid":false,"given":"Fan","family":"Yang","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3169-575X","authenticated-orcid":false,"given":"Kechen","family":"Qin","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8294-1514","authenticated-orcid":false,"given":"Nalin","family":"Gupta","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2034-6207","authenticated-orcid":false,"given":"Spurthi","family":"Sandiri","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4790-5328","authenticated-orcid":false,"given":"Chevanthie","family":"Dissanayake","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7153-1963","authenticated-orcid":false,"given":"Zeynab","family":"Raeesy","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3064-6227","authenticated-orcid":false,"given":"Emre","family":"Barut","sequence":"additional","affiliation":[{"name":"Amazon, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1492-723X","authenticated-orcid":false,"given":"Chengwei","family":"Su","sequence":"additional","affiliation":[{"name":"Amazon, 
USA"}]}],"member":"320","published-online":{"date-parts":[[2023,4,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Flamingo: a visual language model for few-shot learning. arXiv preprint arXiv:2204.14198","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katie Millican, Malcolm Reynolds, 2022. Flamingo: a visual language model for few-shot learning. arXiv preprint arXiv:2204.14198 (2022)."},{"key":"e_1_3_2_1_2_1","unstructured":"Amazon. 2022. Alexa Presentation Language. https:\/\/developer.amazon.com\/en-US\/docs\/alexa\/alexa-presentation-language\/add-visuals-and-audio-to-your-skill.html"},{"key":"e_1_3_2_1_3_1","unstructured":"Amazon. 2022. Amazon Mechanical Turk. https:\/\/www.mturk.com\/"},{"key":"e_1_3_2_1_4_1","unstructured":"Amazon. 2022. Amazon Skills Kit. https:\/\/developer.amazon.com\/en-US\/alexa\/alexa-skills-kit"},{"key":"e_1_3_2_1_5_1","unstructured":"Amazon. 2022. OpenAI. https:\/\/openai.com\/"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00543"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","unstructured":"Soravit Changpinyo Jordi Pont-Tuset Vittorio Ferrari and Radu Soricut. 2021. Telling the What while Pointing to the Where: Multimodal Queries for Image Retrieval. https:\/\/doi.org\/10.48550\/ARXIV.2102.04980","DOI":"10.48550\/ARXIV.2102.04980"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","unstructured":"Tongfei Chen Chetan Naik Hua He Pushpendre Rastogi and Lambert Mathias. 2019. Improving Long Distance Slot Carryover in Spoken Dialogue Systems. https:\/\/doi.org\/10.48550\/ARXIV.1906.01149","DOI":"10.48550\/ARXIV.1906.01149"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00307"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.202"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00304"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01371"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240541"},{"key":"e_1_3_2_1_14_1","volume-title":"Extending CLIP for Category-to-image Retrieval in E-commerce. In European Conference on Information Retrieval. Springer, 289\u2013303","author":"Hendriksen Mariya","year":"2022","unstructured":"Mariya Hendriksen, Maurits Bleeker, Svitlana Vakulenko, Nanne\u00a0van Noord, Ernst Kuiper, and Maarten\u00a0de Rijke. 2022. Extending CLIP for Category-to-image Retrieval in E-commerce. In European Conference on Information Retrieval. Springer, 289\u2013303."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","unstructured":"Mariya Hendriksen Maurits Bleeker Svitlana Vakulenko Nanne van Noord Ernst Kuiper and Maarten de Rijke. 2021. Extending CLIP for Category-to-image Retrieval in E-commerce. https:\/\/doi.org\/10.48550\/ARXIV.2112.11294","DOI":"10.48550\/ARXIV.2112.11294"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","unstructured":"Gabriel Ilharco Mitchell Wortsman Ross Wightman Cade Gordon Nicholas Carlini Rohan Taori Achal Dave Vaishaal Shankar Hongseok Namkoong John Miller Hannaneh Hajishirzi Ali Farhadi and Ludwig Schmidt. 2021. OpenCLIP. 
https:\/\/doi.org\/10.5281\/zenodo.5143773 If you use this software please cite it as below..","DOI":"10.5281\/zenodo.5143773"},{"key":"e_1_3_2_1_17_1","volume-title":"MDETR - Modulated Detection for End-to-End Multi-Modal Understanding. CoRR abs\/2104.12763","author":"Kamath Aishwarya","year":"2021","unstructured":"Aishwarya Kamath, Mannat Singh, Yann LeCun, Ishan Misra, Gabriel Synnaeve, and Nicolas Carion. 2021. MDETR - Modulated Detection for End-to-End Multi-Modal Understanding. CoRR abs\/2104.12763 (2021). arXiv:2104.12763https:\/\/arxiv.org\/abs\/2104.12763"},{"key":"e_1_3_2_1_18_1","volume-title":"Transformers in vision: A survey. ACM computing surveys (CSUR) 54, 10s","author":"Khan Salman","year":"2022","unstructured":"Salman Khan, Muzammal Naseer, Munawar Hayat, Syed\u00a0Waqas Zamir, Fahad\u00a0Shahbaz Khan, and Mubarak Shah. 2022. Transformers in vision: A survey. ACM computing surveys (CSUR) 54, 10s (2022), 1\u201341."},{"key":"e_1_3_2_1_19_1","volume-title":"A Task-oriented Dialog Dataset for Immersive Multimodal Conversations. CoRR abs\/2104.08667","author":"Kottur Satwik","year":"2021","unstructured":"Satwik Kottur, Seungwhan Moon, Alborz Geramifard, and Babak Damavandi. 2021. SIMMC 2.0: A Task-oriented Dialog Dataset for Immersive Multimodal Conversations. CoRR abs\/2104.08667 (2021). arXiv:2104.08667https:\/\/arxiv.org\/abs\/2104.08667"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","unstructured":"Anjishnu Kumar Arpit Gupta Julian Chan Sam Tucker Bjorn Hoffmeister Markus Dreyer Stanislav Peshterliev Ankur Gandhe Denis Filiminov Ariya Rastrow Christian Monson and Agnika Kumar. 2017. Just ASK: Building an Architecture for Extensible Self-Service Spoken Language Understanding. https:\/\/doi.org\/10.48550\/ARXIV.1711.00549","DOI":"10.48550\/ARXIV.1711.00549"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3159652.3159716"},{"key":"e_1_3_2_1_22_1","volume-title":"Multimodal dialogue state tracking. arXiv preprint arXiv:2206.07898","author":"Le Hung","year":"2022","unstructured":"Hung Le, Nancy\u00a0F Chen, and Steven\u00a0CH Hoi. 2022. Multimodal dialogue state tracking. arXiv preprint arXiv:2206.07898 (2022)."},{"key":"e_1_3_2_1_23_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. arXiv preprint arXiv:2201.12086","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. arXiv preprint arXiv:2201.12086 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"\u00a0H. Hoi","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath\u00a0R. Selvaraju, Akhilesh\u00a0Deepak Gotmare, Shafiq\u00a0R. Joty, Caiming Xiong, and Steven C.\u00a0H. Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. CoRR abs\/2107.07651 (2021). arXiv:2107.07651https:\/\/arxiv.org\/abs\/2107.07651"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.124"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled Weight Decay Regularization. 
https:\/\/doi.org\/10.48550\/ARXIV.1711.05101","DOI":"10.48550\/ARXIV.1711.05101"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","unstructured":"Corey Lynch Kamelia Aryafar and Josh Attenberg. 2015. Images Don\u2019t Lie: Transferring Deep Visual Semantic Features to Large-Scale Multimodal Learning to Rank. https:\/\/doi.org\/10.48550\/ARXIV.1511.06746","DOI":"10.48550\/ARXIV.1511.06746"},{"key":"e_1_3_2_1_29_1","volume-title":"Situated and Interactive Multimodal Conversations. CoRR abs\/2006.01460","author":"Moon Seungwhan","year":"2020","unstructured":"Seungwhan Moon, Satwik Kottur, Paul\u00a0A. Crook, Ankita De, Shivani Poddar, Theodore Levin, David Whitney, Daniel Difranco, Ahmad Beirami, Eunjoon Cho, Rajen Subba, and Alborz Geramifard. 2020. Situated and Interactive Multimodal Conversations. CoRR abs\/2006.01460 (2020). arXiv:2006.01460https:\/\/arxiv.org\/abs\/2006.01460"},{"key":"e_1_3_2_1_30_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. CoRR abs\/2103.00020","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. CoRR abs\/2103.00020 (2021). arXiv:2103.00020https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296311"},{"key":"e_1_3_2_1_32_1","unstructured":"Amazon\u00a0Web Services. 2022. Amazon SageMaker. https:\/\/aws.amazon.com\/sagemaker\/"},{"key":"e_1_3_2_1_33_1","unstructured":"Amazon\u00a0Web Services. 2022. AWS Lambda. https:\/\/aws.amazon.com\/lambda\/"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","unstructured":"Aman Shrivastava Ramprasaath\u00a0R. Selvaraju Nikhil Naik and Vicente Ordonez. 2021. CLIP-Lite: Information Efficient Visual Representation Learning from Textual Annotations. https:\/\/doi.org\/10.48550\/ARXIV.2112.07133","DOI":"10.48550\/ARXIV.2112.07133"},{"key":"e_1_3_2_1_35_1","volume-title":"Contrastive Multiview Coding. CoRR abs\/1906.05849","author":"Tian Yonglong","year":"2019","unstructured":"Yonglong Tian, Dilip Krishnan, and Phillip Isola. 2019. Contrastive Multiview Coding. CoRR abs\/1906.05849 (2019). arXiv:1906.05849http:\/\/arxiv.org\/abs\/1906.05849"},{"key":"e_1_3_2_1_36_1","volume-title":"Representation Learning with Contrastive Predictive Coding. CoRR abs\/1807.03748","author":"van\u00a0den Oord A\u00e4ron","year":"2018","unstructured":"A\u00e4ron van\u00a0den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. CoRR abs\/1807.03748 (2018). arXiv:1807.03748http:\/\/arxiv.org\/abs\/1807.03748"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","unstructured":"Tan Wang Xing Xu Yang Yang Alan Hanjalic Heng\u00a0Tao Shen and Jingkuan Song. 2019. Matching Images and Text with Multi-modal Tensor Fusion and Re-ranking. https:\/\/doi.org\/10.48550\/ARXIV.1908.04011","DOI":"10.48550\/ARXIV.1908.04011"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track. 
Association for Computational Linguistics, Abu Dhabi, UAE, 417\u2013428","author":"Wanigasekara Prashan","year":"2022","unstructured":"Prashan Wanigasekara, Nalin Gupta, Fan Yang, Emre Barut, Zeynab Raeesy, Kechen Qin, Stephen Rawls, Xinyue Liu, Chengwei Su, and Spurthi Sandiri. 2022. Multimodal Context Carryover. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: Industry Track. Association for Computational Linguistics, Abu Dhabi, UAE, 417\u2013428. https:\/\/aclanthology.org\/2022.emnlp-industry.42"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9892420"},{"key":"e_1_3_2_1_40_1","volume-title":"Contrastive Representation Learning. lilianweng.github.io (May","author":"Weng Lilian","year":"2021","unstructured":"Lilian Weng. 2021. Contrastive Representation Learning. lilianweng.github.io (May 2021). https:\/\/lilianweng.github.io\/posts\/2021-05-31-contrastive\/"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"e_1_3_2_1_42_1","unstructured":"Wenyi Wu Karim Bouyarmane and Ismail Tutar. 2022. Catalog Phrase Grounding (CPG): Grounding of product textual attributes in product images for e-commerce vision-language applications. (2022)."},{"key":"e_1_3_2_1_43_1","volume-title":"Multimodal learning with transformers: a survey. arXiv preprint arXiv:2206.06488","author":"Xu Peng","year":"2022","unstructured":"Peng Xu, Xiatian Zhu, and David\u00a0A Clifton. 2022. Multimodal learning with transformers: a survey. arXiv preprint arXiv:2206.06488 (2022)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","unstructured":"Jiahui Yu Zirui Wang Vijay Vasudevan Legg Yeung Mojtaba Seyedhosseini and Yonghui Wu. 2022. CoCa: Contrastive Captioners are Image-Text Foundation Models. https:\/\/doi.org\/10.48550\/ARXIV.2205.01917","DOI":"10.48550\/ARXIV.2205.01917"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539151"}],"event":{"name":"WWW '23: The ACM Web Conference 2023","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Austin TX USA","acronym":"WWW '23"},"container-title":["Companion Proceedings of the ACM Web Conference 2023"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3543873.3584655","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3543873.3584655","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:33:52Z","timestamp":1755819232000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3543873.3584655"}},"subtitle":["A systems perspective"],"short-title":[],"issued":{"date-parts":[[2023,4,30]]},"references-count":45,"alternative-id":["10.1145\/3543873.3584655","10.1145\/3543873"],"URL":"https:\/\/doi.org\/10.1145\/3543873.3584655","relation":{},"subject":[],"published":{"date-parts":[[2023,4,30]]},"assertion":[{"value":"2023-04-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
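The record above is a standard Crossref REST API "work" envelope (status / message-type / message). As a minimal sketch of how such a record can be fetched and parsed, assuming network access and the third-party requests package, the following Python reads the same DOI from the public api.crossref.org endpoint; every field name used below appears in the record above.

import requests

DOI = "10.1145/3543873.3584655"  # DOI of the record shown above

# The public Crossref REST API returns the same envelope shown above:
# {"status": "ok", "message-type": "work", ..., "message": {...}}
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the "message" object mirrors the record above

# Pull a few of the fields present in the record.
title = work["title"][0]
subtitle = (work.get("subtitle") or [""])[0]
authors = ", ".join(
    f'{a.get("given", "")} {a.get("family", "")}'.strip()
    for a in work.get("author", [])
)

print(f"{title}: {subtitle}")
print(f"Authors: {authors}")
print(f"DOI: {work['DOI']}, pages {work.get('page')}, references: {work.get('references-count')}")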