{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:22Z","timestamp":1750309522639,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680804","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"9601-9610","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Spatiotemporal Graph Guided Multi-modal Network for Livestreaming Product Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7364-4489","authenticated-orcid":false,"given":"Xiaowan","family":"Hu","sequence":"first","affiliation":[{"name":"The Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3393-1774","authenticated-orcid":false,"given":"Yiyi","family":"Chen","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1882-3331","authenticated-orcid":false,"given":"Yan","family":"Li","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1592-6473","authenticated-orcid":false,"given":"Minquan","family":"Wang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2792-8469","authenticated-orcid":false,"given":"Haoqian","family":"Wang","sequence":"additional","affiliation":[{"name":"The Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4865-2396","authenticated-orcid":false,"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9801-9292","authenticated-orcid":false,"given":"Han","family":"Li","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9266-0780","authenticated-orcid":false,"given":"Peng","family":"Jiang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"2","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML, Vol. 2. 4.","journal-title":"ICML"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00052"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3499027"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.444"},{"key":"e_1_3_2_1_7_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_8_1","volume-title":"Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097","author":"Fang Han","year":"2021","unstructured":"Han Fang, Pengfei Xiong, Luhui Xu, and Yu Chen. 2021. Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00243"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00548"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00059"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548020"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.382"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.127"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123429"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-021-3367-y"},{"key":"e_1_3_2_1_18_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00316"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_2_1_22_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"e_1_3_2_1_24_1","volume-title":"Yu-Chiang Frank Wang, and Shao-Yi Chien","author":"Liu Chih-Ting","year":"2019","unstructured":"Chih-Ting Liu, Chih-Wei Wu, Yu-Chiang Frank Wang, and Shao-Yi Chien. 2019. Spatially and temporally efficient non-local attention network for video-based person re-identification. arXiv preprint arXiv:1908.01683 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"Cma-clip: Cross-modality attention clip for image-text classification. arXiv preprint arXiv:2112.03562","author":"Liu Huidong","year":"2021","unstructured":"Huidong Liu, Shaoyuan Xu, Jinmiao Fu, Yang Liu, Ning Xie, Chien-Chih Wang, Bryan Wang, and Yi Sun. 2021. Cma-clip: Cross-modality attention clip for image-text classification. arXiv preprint arXiv:2112.03562 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.124"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_28_1","volume-title":"Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983","author":"Loshchilov Ilya","year":"2016","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)."},{"key":"e_1_3_2_1_29_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Machine Learning. PMLR, 9410--9421","author":"Seidenschwarz Jenny Denise","year":"2021","unstructured":"Jenny Denise Seidenschwarz, Ismail Elezi, and Laura Leal-Taix\u00e9. 2021. Learning intra-batch connections for deep metric learning. In International Conference on Machine Learning. PMLR, 9410--9421."},{"key":"e_1_3_2_1_35_1","volume-title":"Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530","author":"Su Weijie","year":"2019","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2019. Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_37_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2014.2320500"},{"key":"e_1_3_2_1_39_1","volume-title":"Graph attention networks. arXiv preprint arXiv:1710.10903","author":"Velivckovi\u0107 Petar","year":"2017","unstructured":"Petar Velivckovi\u0107, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Lio, and Yoshua Bengio. 2017. Graph attention networks. arXiv preprint arXiv:1710.10903 (2017)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3011288"},{"key":"e_1_3_2_1_41_1","volume-title":"Learning dual semantic relations with graph attention for image-text matching","author":"Wen Keyu","year":"2020","unstructured":"Keyu Wen, Xiaodong Gu, and Qingrong Cheng. 2020. Learning dual semantic relations with graph attention for image-text matching. IEEE transactions on circuits and systems for video technology, Vol. 31, 7 (2020), 2866--2879."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6941"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00297"},{"key":"e_1_3_2_1_44_1","volume-title":"Chinese clip: Contrastive vision-language pretraining in chinese. arXiv preprint arXiv:2211.01335","author":"Yang An","year":"2022","unstructured":"An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, and Chang Zhou. 2022. Chinese clip: Contrastive vision-language pretraining in chinese. arXiv preprint arXiv:2211.01335 (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"Cross-view Semantic Alignment for Livestreaming Product Recognition. arXiv preprint arXiv:2308.04912","author":"Yang Wenjie","year":"2023","unstructured":"Wenjie Yang, Yiyi Chen, Yan Li, Yanhua Cheng, Xudong Liu, Quan Chen, and Han Li. 2023. Cross-view Semantic Alignment for Livestreaming Product Recognition. arXiv preprint arXiv:2308.04912 (2023)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence. 1054--1060","author":"Zhao Hongrui","year":"2021","unstructured":"Hongrui Zhao, Jin Yu, Yanan Li, Donghui Wang, Jie Liu, Hongxia Yang, and Fei Wu. 2021. Dress like an internet celebrity: Fashion retrieval in videos. In Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence. 1054--1060."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655035"},{"key":"e_1_3_2_1_49_1","volume-title":"Graph neural networks: A review of methods and applications. AI open","author":"Zhou Jie","year":"2020","unstructured":"Jie Zhou, Ganqu Cui, Shengding Hu, Zhengyan Zhang, Cheng Yang, Zhiyuan Liu, Lifeng Wang, Changcheng Li, and Maosong Sun. 2020. Graph neural networks: A review of methods and applications. AI open, Vol. 1 (2020), 57--81."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583286"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680804","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680804","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:07Z","timestamp":1750295887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680804"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":51,"alternative-id":["10.1145\/3664647.3680804","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680804","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}