{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T17:51:55Z","timestamp":1781891515460,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research Program","award":["2018YFB1402800"],"award-info":[{"award-number":["2018YFB1402800"]}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["C91846204\/U19B2027"],"award-info":[{"award-number":["C91846204\/U19B2027"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475648","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T06:21:10Z","timestamp":1634538070000},"page":"2744-2752","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Knowledge Perceived Multi-modal Pretraining in E-commerce"],"prefix":"10.1145","author":[{"given":"Yushan","family":"Zhu","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huaixiao","family":"Zhao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ganqiang","family":"Ye","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hui","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ningyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huajun","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1108\/K-03-2019-0199"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240323.3240367"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308560.3316602"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366424.3383295"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE51399.2021.00291"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_2_7_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. CoRR, abs\/1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li , Mark Yatskar , Da Yin , Cho-Jui Hsieh , and Kai-Wei Chang . Visualbert: A simple and performant baseline for vision and language. CoRR, abs\/1908.03557 , 2019 . Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. Visualbert: A simple and performant baseline for vision and language. CoRR, abs\/1908.03557, 2019."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_2_9_1","volume-title":"ICLR. OpenReview.net","author":"Su Weijie","year":"2020","unstructured":"Weijie Su , Xizhou Zhu , Yue Cao , Bin Li , Lewei Lu , Furu Wei , and Jifeng Dai . VL-BERT: pre-training of generic visual-linguistic representations . In ICLR. OpenReview.net , 2020 . Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. VL-BERT: pre-training of generic visual-linguistic representations. In ICLR. OpenReview.net, 2020."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401430"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454289"},{"key":"e_1_3_2_2_13_1","volume-title":"Interbert: Vision-and-language interaction for multi-modal pretraining. CoRR, abs\/2003.13198","author":"Lin Junyang","year":"2020","unstructured":"Junyang Lin , An Yang , Yichang Zhang , Jie Liu , Jingren Zhou , and Hongxia Yang . Interbert: Vision-and-language interaction for multi-modal pretraining. CoRR, abs\/2003.13198 , 2020 . Junyang Lin, An Yang, Yichang Zhang, Jie Liu, Jingren Zhou, and Hongxia Yang. Interbert: Vision-and-language interaction for multi-modal pretraining. CoRR, abs\/2003.13198, 2020."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1219"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413538"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE51399.2021.00280"},{"key":"e_1_3_2_2_18_1","volume-title":"ICLR","author":"Simonyan Karen","year":"2015","unstructured":"Karen Simonyan and Andrew Zisserman . Very deep convolutional networks for large-scale image recognition . In ICLR , 2015 . Karen Simonyan and Andrew Zisserman. Very deep convolutional networks for large-scale image recognition. In ICLR, 2015."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_2_21_1","first-page":"4171","volume-title":"NAACL-HLT (1)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . BERT: pre-training of deep bidirectional transformers for language understanding . In NAACL-HLT (1) , pages 4171 -- 4186 . Association for Computational Linguistics , 2019 . Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: pre-training of deep bidirectional transformers for language understanding. In NAACL-HLT (1), pages 4171--4186. Association for Computational Linguistics, 2019."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454804"},{"key":"e_1_3_2_2_23_1","volume-title":"NeurIPS","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown , Benjamin Mann , Nick Ryder , Melanie Subbiah , Jared Kaplan , Prafulla Dhariwal , Arvind Neelakantan , Pranav Shyam , Girish Sastry , Amanda Askell , Sandhini Agarwal , Ariel Herbert-Voss , Gretchen Krueger , Tom Henighan , Rewon Child , Aditya Ramesh , Daniel M. Ziegler , Jeffrey Wu , Clemens Winter , Christopher Hesse , Mark Chen , Eric Sigler , Mateusz Litwin , Scott Gray , Benjamin Chess , Jack Clark , Christopher Berner , Sam McCandlish , Alec Radford , Ilya Sutskever , and Dario Amodei . Language models are few-shot learners . In NeurIPS , 2020 . Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. Language models are few-shot learners. In NeurIPS, 2020."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i03.5681"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1139"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00360"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1005"},{"key":"e_1_3_2_2_29_1","volume-title":"et al. K-adapter: Infusing knowledge into pre-trained models with adapters. arXiv preprint arXiv:2002.01808","author":"Wang Ruize","year":"2020","unstructured":"Ruize Wang , Duyu Tang , Nan Duan , Zhongyu Wei , Xuanjing Huang , Cuihong Cao , Daxin Jiang , Ming Zhou , et al. K-adapter: Infusing knowledge into pre-trained models with adapters. arXiv preprint arXiv:2002.01808 , 2020 . Ruize Wang, Duyu Tang, Nan Duan, Zhongyu Wei, Xuanjing Huang, Cuihong Cao, Daxin Jiang, Ming Zhou, et al. K-adapter: Infusing knowledge into pre-trained models with adapters. arXiv preprint arXiv:2002.01808, 2020."},{"key":"e_1_3_2_2_30_1","volume-title":"Contextual knowledge selection and embedding towards enhanced pre-trained language models. arXiv preprint arXiv:2009.13964","author":"Su YuSheng","year":"2020","unstructured":"YuSheng Su , Xu Han , Zhengyan Zhang , Peng Li , Zhiyuan Liu , Yankai Lin , Jie Zhou , and Maosong Sun . Contextual knowledge selection and embedding towards enhanced pre-trained language models. arXiv preprint arXiv:2009.13964 , 2020 . YuSheng Su, Xu Han, Zhengyan Zhang, Peng Li, Zhiyuan Liu, Yankai Lin, Jie Zhou, and Maosong Sun. Contextual knowledge selection and embedding towards enhanced pre-trained language models. arXiv preprint arXiv:2009.13964, 2020."},{"key":"e_1_3_2_2_31_1","volume-title":"Jaket: Joint pre-training of knowledge graph and language understanding. arXiv preprint arXiv:2010.00796","author":"Yu Donghan","year":"2020","unstructured":"Donghan Yu , Chenguang Zhu , Yiming Yang , and Michael Zeng . Jaket: Joint pre-training of knowledge graph and language understanding. arXiv preprint arXiv:2010.00796 , 2020 . Donghan Yu, Chenguang Zhu, Yiming Yang, and Michael Zeng. Jaket: Joint pre-training of knowledge graph and language understanding. arXiv preprint arXiv:2010.00796, 2020."},{"key":"e_1_3_2_2_32_1","volume-title":"Knowledge-aware language model pretraining. arXiv preprint arXiv:2007.00655","author":"Rosset Corby","year":"2020","unstructured":"Corby Rosset , Chenyan Xiong , Minh Phan , Xia Song , Paul Bennett , and Saurabh Tiwary . Knowledge-aware language model pretraining. arXiv preprint arXiv:2007.00655 , 2020 . Corby Rosset, Chenyan Xiong, Minh Phan, Xia Song, Paul Bennett, and Saurabh Tiwary. Knowledge-aware language model pretraining. arXiv preprint arXiv:2007.00655, 2020."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.244"},{"key":"e_1_3_2_2_34_1","volume-title":"Anna Korhonen, and Goran Glavavs. Informing unsupervised pretraining with external linguistic knowledge. arXiv preprint arXiv:1909.02339","author":"Lauscher Anne","year":"2019","unstructured":"Anne Lauscher , Ivan Vuli\u0107 , Edoardo Maria Ponti , Anna Korhonen, and Goran Glavavs. Informing unsupervised pretraining with external linguistic knowledge. arXiv preprint arXiv:1909.02339 , 2019 . Anne Lauscher, Ivan Vuli\u0107, Edoardo Maria Ponti, Anna Korhonen, and Goran Glavavs. Informing unsupervised pretraining with external linguistic knowledge. arXiv preprint arXiv:1909.02339, 2019."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969239.2969250"},{"key":"e_1_3_2_2_36_1","volume-title":"Google's neural machine translation system: Bridging the gap between human and machine translation. CoRR, abs\/1609.08144","author":"Wu Yonghui","year":"2016","unstructured":"Yonghui Wu , Mike Schuster , Zhifeng Chen , Quoc V. Le , Mohammad Norouzi , Wolfgang Macherey , Maxim Krikun , Yuan Cao , Qin Gao , Klaus Macherey , Jeff Klingner , Apurva Shah , Melvin Johnson , Xiaobing Liu , Lukasz Kaiser , Stephan Gouws , Yoshikiyo Kato , Taku Kudo , Hideto Kazawa , Keith Stevens , George Kurian , Nishant Patil , Wei Wang , Cliff Young , Jason Smith , Jason Riesa , Alex Rudnick , Oriol Vinyals , Greg Corrado , Macduff Hughes , and Jeffrey Dean . Google's neural machine translation system: Bridging the gap between human and machine translation. CoRR, abs\/1609.08144 , 2016 . Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser, Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens, George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, and Jeffrey Dean. Google's neural machine translation system: Bridging the gap between human and machine translation. CoRR, abs\/1609.08144, 2016."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01079"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1466"},{"key":"e_1_3_2_2_39_1","volume-title":"ICLR (Poster). OpenReview.net","author":"Velickovic Petar","year":"2018","unstructured":"Petar Velickovic , Guillem Cucurull , Arantxa Casanova , Adriana Romero , Pietro Li\u00f2 , and Yoshua Bengio . Graph attention networks . In ICLR (Poster). OpenReview.net , 2018 . Petar Velickovic, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Li\u00f2, and Yoshua Bengio. Graph attention networks. In ICLR (Poster). OpenReview.net, 2018."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/2999792.2999923"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403182"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475648","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475648","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475648","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:24Z","timestamp":1750193304000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475648"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":41,"alternative-id":["10.1145\/3474085.3475648","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475648","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}