{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:10:49Z","timestamp":1750219849677,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,8,9]],"date-time":"2023-08-09T00:00:00Z","timestamp":1691539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,8,9]]},"DOI":"10.1145\/3578337.3605133","type":"proceedings-article","created":{"date-parts":[[2023,8,9]],"date-time":"2023-08-09T22:12:46Z","timestamp":1691619166000},"page":"189-198","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning Aligned Cross-Modal and Cross-Product Embeddings for Generating the Topics of Shopping Needs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5234-9038","authenticated-orcid":false,"given":"Yi-Ru","family":"Tsai","sequence":"first","affiliation":[{"name":"Graduate Institute of Networking and Multimedia, Taipei, Taiwan Roc"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5892-0385","authenticated-orcid":false,"given":"Pu-Jen","family":"Cheng","sequence":"additional","affiliation":[{"name":"Graduate Institute of Networking and Multimedia, Taipei, Taiwan Roc"}]}],"member":"320","published-online":{"date-parts":[[2023,8,9]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"24206","article-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text","volume":"34","author":"Akbari Hassan","year":"2021","unstructured":"Hassan Akbari, Liangzhe Yuan, Rui Qian, Wei-Hong Chuang, Shih-Fu Chang, Yin Cui, and Boqing Gong. 2021. Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text. Advances in Neural Information Processing Systems 34 (2021), 24206--24221.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330725"},{"key":"e_1_3_2_2_3_1","volume-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)."},{"key":"e_1_3_2_2_4_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_5_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019460"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","unstructured":"Maarten Grootendorst. 2020. KeyBERT: Minimal keyword extraction with BERT. https:\/\/doi.org\/10.5281\/zenodo.4461265","DOI":"10.5281\/zenodo.4461265"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401063"},{"key":"e_1_3_2_2_10_1","volume-title":"Long short-term memory. Neural computation 9, 8","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_2_11_1","volume-title":"et al","author":"Li Chenliang","year":"2022","unstructured":"Chenliang Li, Haiyang Xu, Junfeng Tian, Wei Wang, Ming Yan, Bin Bi, Jiabo Ye, Hehong Chen, Guohai Xu, Zheng Cao, et al . 2022. mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections. arXiv preprint arXiv:2205.12005 (2022)."},{"key":"e_1_3_2_2_12_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_2_13_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021), 9694--9705."},{"key":"e_1_3_2_2_14_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","unstructured":"Ziyang Luo Yadong Xi Rongsheng Zhang and Jing Ma. 2022. A Frustratingly Simple Approach for End-to-End Image Captioning. https:\/\/doi.org\/10.48550\/ARXIV.2201.12723","DOI":"10.48550\/ARXIV.2201.12723"},{"key":"e_1_3_2_2_16_1","volume-title":"BERTGEN: Multi-task Generation through BERT. arXiv preprint arXiv:2106.03484","author":"Mitzalis Faidon","year":"2021","unstructured":"Faidon Mitzalis, Ozan Caglayan, Pranava Madhyastha, and Lucia Specia. 2021. BERTGEN: Multi-task Generation through BERT. arXiv preprint arXiv:2106.03484 (2021)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1018"},{"key":"e_1_3_2_2_18_1","volume-title":"An introduction to convolutional neural networks. arXiv preprint arXiv:1511.08458","author":"O'Shea Keiron","year":"2015","unstructured":"Keiron O'Shea and Ryan Nash. 2015. An introduction to convolutional neural networks. arXiv preprint arXiv:1511.08458 (2015)."},{"key":"e_1_3_2_2_19_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_20_1","volume-title":"BPR: Bayesian personalized ranking from implicit feedback. arXiv preprint arXiv:1205.2618","author":"Rendle Steffen","year":"2012","unstructured":"Steffen Rendle, Christoph Freudenthaler, Zeno Gantner, and Lars Schmidt-Thieme. 2012. BPR: Bayesian personalized ranking from implicit feedback. arXiv preprint arXiv:1205.2618 (2012)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.physd.2019.132306"},{"key":"e_1_3_2_2_22_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_2_23_1","volume-title":"International conference on machine learning. PMLR, 10347--10357","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2021. Training data-efficient image transformers & distillation through attention. In International conference on machine learning. PMLR, 10347--10357."},{"key":"e_1_3_2_2_24_1","volume-title":"International conference on machine learning. PMLR, 10347--10357","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2021. Training data-efficient image transformers & distillation through attention. In International conference on machine learning. PMLR, 10347--10357."},{"key":"e_1_3_2_2_25_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"crossref","unstructured":"Thomas Wolf Lysandre Debut Victor Sanh Julien Chaumond Clement Delangue Anthony Moi Pierric Cistac Tim Rault R\u00e9mi Louf Morgan Funtowicz et al. 2019. Huggingface's transformers: State-of-the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019).","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_2_27_1","volume-title":"Multi-modal generative adversarial network for short product title generation in mobile e-commerce. arXiv preprint arXiv:1904.01735","author":"Zhang Jian-Guo","year":"2019","unstructured":"Jian-Guo Zhang, Pengcheng Zou, Zhao Li, Yao Wan, Xiuming Pan, Yu Gong, and Philip S Yu. 2019. Multi-modal generative adversarial network for short product title generation in mobile e-commerce. arXiv preprint arXiv:1904.01735 (2019)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3411954"}],"event":{"name":"ICTIR '23: The 2023 ACM SIGIR International Conference on the Theory of Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Taipei Taiwan","acronym":"ICTIR '23"},"container-title":["Proceedings of the 2023 ACM SIGIR International Conference on Theory of Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3578337.3605133","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3578337.3605133","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:04Z","timestamp":1750178824000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3578337.3605133"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,9]]},"references-count":28,"alternative-id":["10.1145\/3578337.3605133","10.1145\/3578337"],"URL":"https:\/\/doi.org\/10.1145\/3578337.3605133","relation":{},"subject":[],"published":{"date-parts":[[2023,8,9]]},"assertion":[{"value":"2023-08-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}