{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:52:50Z","timestamp":1777614770410,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,14]],"date-time":"2022-08-14T00:00:00Z","timestamp":1660435200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,8,14]]},"DOI":"10.1145\/3534678.3539149","type":"proceedings-article","created":{"date-parts":[[2022,8,12]],"date-time":"2022-08-12T19:06:12Z","timestamp":1660331172000},"page":"2851-2859","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Automatic Generation of Product-Image Sequence in E-commerce"],"prefix":"10.1145","author":[{"given":"Xiaochuan","family":"Fan","sequence":"first","affiliation":[{"name":"JD.COM Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chi","family":"Zhang","sequence":"additional","affiliation":[{"name":"JD.COM Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yong","family":"Yang","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, UNK, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yue","family":"Shang","sequence":"additional","affiliation":[{"name":"JD.COM Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xueying","family":"Zhang","sequence":"additional","affiliation":[{"name":"JD.COM Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhen","family":"He","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, UNK, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yun","family":"Xiao","sequence":"additional","affiliation":[{"name":"JD.COM Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Long","sequence":"additional","affiliation":[{"name":"JD.COM, Beijing, UNK, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lingfei","family":"Wu","sequence":"additional","affiliation":[{"name":"JD.COM Research, Mountain View, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,8,14]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2018.8622259"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10660-013-9104-5"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/11744078_23"},{"key":"e_1_3_2_1_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2556195.2556226"},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","volume-title":"Image matters: Detecting offensive and non-compliant content\/ logo in product images. arXiv preprint arXiv:1905.02234","author":"Gandhi Shreyansh","year":"2019","unstructured":"Shreyansh Gandhi, Samrat Kokkula, Abon Chaudhuri, Alessandro Magnani, Theban Stanley, Behzad Ahmadi, Venkatesh Kandaswamy, Omer Ovenc, and Shie Mannor. 2019. Image matters: Detecting offensive and non-compliant content\/ logo in product images. arXiv preprint arXiv:1905.02234 (2019)."},{"key":"e_1_3_2_1_10_1","unstructured":"Xiaojie Guo Shugen Wang Hanqing Zhao Shiliang Diao Jiajia Chen Zhuoye Ding Zhen He Yun Xiao Bo Long Han Yu et al. 2021. Intelligent Online Selling Point Extraction for E-Commerce Recommendation. arXiv preprint arXiv:2112.10613 (2021)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7532767"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1631272.1631361"},{"key":"e_1_3_2_1_15_1","volume-title":"International journal of computer vision 46, 1","author":"Jones Michael J","year":"2002","unstructured":"Michael J Jones and James M Rehg. 2002. Statistical color models with application to skin detection. International journal of computer vision 46, 1 (2002), 81--96."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.224"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR.2006.303","volume-title":"2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06)","volume":"1","author":"Ke Yan","year":"2006","unstructured":"Yan Ke, Xiaoou Tang, and Feng Jing. 2006. The design of high-level features for photo quality assessment. In 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06), Vol. 1. IEEE, 419--426."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_29"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3333612"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_21_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_22_1","volume-title":"Cptr: Full transformer network for image captioning. arXiv preprint arXiv:2101.10804","author":"Liu Wei","year":"2021","unstructured":"Wei Liu, Sihan Chen, Longteng Guo, Xinxin Zhu, and Jing Liu. 2021. Cptr: Full transformer network for image captioning. arXiv preprint arXiv:2101.10804 (2021)."},{"key":"e_1_3_2_1_23_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_24_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv preprint arXiv:1908.02265","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv preprint arXiv:1908.02265 (2019)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2015.7350927"},{"key":"e_1_3_2_1_26_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. arXiv preprint arXiv:2103.00020 (2021)."},{"key":"e_1_3_2_1_27_1","unstructured":"Alec Radford JeffreyWu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_2_1_28_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2461466.2461486"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6026"},{"key":"e_1_3_2_1_31_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_32_1","volume-title":"Theo Gevers, and Arnold WM Smeulders.","author":"Uijlings Jasper RR","year":"2013","unstructured":"Jasper RR Uijlings, Koen EA Van De Sande, Theo Gevers, and Arnold WM Smeulders. 2013. Selective search for object recognition. International journal of computer vision 104, 2 (2013), 154--171."},{"key":"e_1_3_2_1_33_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998--6008."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_35_1","volume-title":"CLUECorpus2020: A large-scale Chinese corpus for pre-training language model. arXiv preprint arXiv:2003.01355","author":"Xu Liang","year":"2020","unstructured":"Liang Xu, Xuanwei Zhang, and Qianqian Dong. 2020. CLUECorpus2020: A large-scale Chinese corpus for pre-training language model. arXiv preprint arXiv:2003.01355 (2020)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/DMDCM.2011.36"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178876.3186146"},{"key":"e_1_3_2_1_38_1","volume-title":"Item popularity prediction in e-commerce using image quality feature vectors. arXiv preprint arXiv:1605.03663","author":"Zakrewsky Stephen","year":"2016","unstructured":"Stephen Zakrewsky, Kamelia Aryafar, and Ali Shokoufandeh. 2016. Item popularity prediction in e-commerce using image quality feature vectors. arXiv preprint arXiv:1605.03663 (2016)."},{"key":"e_1_3_2_1_39_1","unstructured":"Xueying Zhang Yanyan Zou Hainan Zhang Jing Zhou Shiliang Diao Jiajia Chen Zhuoye Ding Zhen He Xueqi He Yun Xiao et al. 2021. Automatic Product Copywriting for E-Commerce. arXiv preprint arXiv:2112.11915 (2021)."},{"key":"e_1_3_2_1_40_1","volume-title":"European conference on computer vision. Springer, 391--405","author":"Lawrence Zitnick C","year":"2014","unstructured":"C Lawrence Zitnick and Piotr Doll\u00e1r. 2014. Edge boxes: Locating object proposals from edges. In European conference on computer vision. Springer, 391--405."}],"event":{"name":"KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Washington DC USA","acronym":"KDD '22","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539149","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3534678.3539149","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:58Z","timestamp":1750186978000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539149"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,14]]},"references-count":40,"alternative-id":["10.1145\/3534678.3539149","10.1145\/3534678"],"URL":"https:\/\/doi.org\/10.1145\/3534678.3539149","relation":{},"subject":[],"published":{"date-parts":[[2022,8,14]]},"assertion":[{"value":"2022-08-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}