{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:09:51Z","timestamp":1750219791492,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":30,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,7,18]],"date-time":"2023-07-18T00:00:00Z","timestamp":1689638400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,7,19]]},"DOI":"10.1145\/3539618.3591836","type":"proceedings-article","created":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T00:22:59Z","timestamp":1689726179000},"page":"3275-3279","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["TMML: Text-Guided MuliModal Product Location For Alleviating Retrieval Inconsistency in E-Commerce"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-9037-8128","authenticated-orcid":false,"given":"Youhua","family":"Tang","sequence":"first","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5992-9733","authenticated-orcid":false,"given":"Xiong","family":"Xiong","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3185-8431","authenticated-orcid":false,"given":"Siyang","family":"Sun","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7863-5387","authenticated-orcid":false,"given":"Baoliang","family":"Cui","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5767-014X","authenticated-orcid":false,"given":"Yun","family":"Zheng","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7103-975X","authenticated-orcid":false,"given":"Haihong","family":"Tang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,7,18]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"aliyun. 2022. TP Toolbox. https:\/\/ai.aliyun.com\/nlp\/ke."},{"key":"e_1_3_2_2_2_1","volume-title":"Dynamic Head: Unifying Object Detection Heads with Attentions. computer vision and pattern recognition","author":"Dai Xiyang","year":"2021","unstructured":"Xiyang Dai, Yinpeng Chen, Bin Xiao, Dongdong Chen, Mengchen Liu, Lu Yuan, and Lei Zhang. 2021. Dynamic Head: Unifying Object Detection Heads with Attentions. computer vision and pattern recognition (2021)."},{"key":"e_1_3_2_2_3_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. north american","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. north american chapter of the association for computational linguistics (2018)."},{"key":"e_1_3_2_2_4_1","volume-title":"Yolox: Exceeding yolo series in","author":"Ge Zheng","year":"2021","unstructured":"Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun. 2021. Yolox: Exceeding yolo series in 2021. arXiv preprint arXiv:2107.08430 (2021)."},{"key":"e_1_3_2_2_5_1","volume-title":"Contrastive Learning for Weakly Supervised Phrase Grounding. european conference on computer vision","author":"Gupta Tanmay","year":"2020","unstructured":"Tanmay Gupta, Arash Vahdat, Gal Chechik, Xiaodong Yang, Jan Kautz, and Derek Hoiem. 2020. Contrastive Learning for Weakly Supervised Phrase Grounding. european conference on computer vision (2020)."},{"key":"e_1_3_2_2_6_1","volume-title":"Masking: A New Perspective of Noisy Supervision. neural information processing systems","author":"Han Bo","year":"2018","unstructured":"Bo Han, Jiangchao Yao, Gang Niu, Mingyuan Zhou, Ivor W. Tsang, Ya Zhang, and Masashi Sugiyama. 2018. Masking: A New Perspective of Noisy Supervision. neural information processing systems (2018)."},{"key":"e_1_3_2_2_7_1","volume-title":"Co-teaching: Robust Training of Deep Neural Networks with Extremely Noisy Labels. neural information processing systems","author":"Han Bo","year":"2018","unstructured":"Bo Han, Quanming Yao, Xingrui Yu, Gang Niu, Miao Xu, Weihua Hu, Ivor W. Tsang, and Masashi Sugiyama. 2018. Co-teaching: Robust Training of Deep Neural Networks with Extremely Noisy Labels. neural information processing systems (2018)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Xiao Han Licheng Yu Xiatian Zhu Li Zhang Yi-Zhe Song and Tao Xiang. 2022. FashionViL: Fashion-Focused Vision-and-Language Representation Learning.","DOI":"10.1007\/978-3-031-19833-5_37"},{"key":"e_1_3_2_2_9_1","volume-title":"MDETR -- Modulated Detection for End-to-End Multi-Modal Understanding. international conference on computer vision","author":"Kamath Aishwarya","year":"2021","unstructured":"Aishwarya Kamath, Mannat Singh, Yann LeCun, Ishan Misra, Gabriel Synnaeve, and Nicolas Carion. 2021. MDETR -- Modulated Detection for End-to-End Multi-Modal Understanding. international conference on computer vision (2021)."},{"key":"e_1_3_2_2_10_1","volume-title":"Hoi","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath R. Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven C. H. Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. neural information processing systems (2021)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Liunian Harold Li Pengchuan Zhang Haotian Zhang Jianwei Yang Chunyuan Li Yiwu Zhong Lijuan Wang Lu Yuan5 Lei Zhang Jenq-Neng Hwang Kai-Wei Chang and Jianfeng Gao. 2022. Grounded Language-Image Pre-training.","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_2_12_1","volume-title":"Feature Pyramid Networks for Object Detection. arXiv: Computer Vision and Pattern Recognition","author":"Lin Tsung-Yi","year":"2016","unstructured":"Tsung-Yi Lin, Piotr Doll\u00e1r, Ross Girshick, Kaiming He, Bharath Hariharan, and Serge Belongie. 2016. Feature Pyramid Networks for Object Detection. arXiv: Computer Vision and Pattern Recognition (2016)."},{"key":"e_1_3_2_2_13_1","volume-title":"Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. international conference on computer vision","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo. 2021. Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. international conference on computer vision (2021)."},{"key":"e_1_3_2_2_14_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_2_15_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_2_17_1","volume-title":"Fully Convolutional One-Stage 3D Object Detection on LiDAR Range Images. arXiv preprint arXiv:2205.13764","author":"Tian Zhi","year":"2022","unstructured":"Zhi Tian, Xiangxiang Chu, Xiaoming Wang, Xiaolin Wei, and Chunhua Shen. 2022. Fully Convolutional One-Stage 3D Object Detection on LiDAR Range Images. arXiv preprint arXiv:2205.13764 (2022)."},{"key":"e_1_3_2_2_18_1","volume-title":"Attention is All you Need. neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. neural information processing systems (2017)."},{"key":"e_1_3_2_2_19_1","volume-title":"Phrase Localization Without Paired Training Examples. international conference on computer vision","author":"Lucia Specia JosiahWang","year":"2019","unstructured":"JosiahWang and Lucia Specia. 2019. Phrase Localization Without Paired Training Examples. international conference on computer vision (2019)."},{"key":"e_1_3_2_2_20_1","volume-title":"Improving Weakly Supervised Visual Grounding by Contrastive Knowledge Distillation. computer vision and pattern recognition","author":"Wang Liwei","year":"2021","unstructured":"Liwei Wang, Jing Huang, Yin Li, Kun Xu, Zhengyuan Yang, and Dong Yu. 2021. Improving Weakly Supervised Visual Grounding by Contrastive Knowledge Distillation. computer vision and pattern recognition (2021)."},{"key":"e_1_3_2_2_21_1","volume-title":"Open-set Label Noise Can Improve Robustness Against Inherent Label Noise. arXiv: Learning","author":"Wei Hongxin","year":"2021","unstructured":"Hongxin Wei, Lue Tao, Renchunzi Xie, and Bo An. 2021. Open-set Label Noise Can Improve Robustness Against Inherent Label Noise. arXiv: Learning (2021)."},{"key":"e_1_3_2_2_22_1","unstructured":"Jiannan Wu Yi Jiang Peize Sun Zehuan Yuan and Ping Luo. 2022. Language as Queries for Referring Video Object Segmentation."},{"key":"e_1_3_2_2_23_1","volume-title":"Yury Bubnov, Leon Stein, Qiaosong Wang, M. Hadi Kiapour, and Robinson Piramuthu.","author":"Yang Fan","year":"2017","unstructured":"Fan Yang, Ajinkya Gorakhnath Kale, Yury Bubnov, Leon Stein, Qiaosong Wang, M. Hadi Kiapour, and Robinson Piramuthu. 2017. Visual Search at eBay. knowledge discovery and data mining (2017)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00119"},{"key":"e_1_3_2_2_25_1","volume-title":"Schwing","author":"Yeh Raymond A.","year":"2018","unstructured":"Raymond A. Yeh, Minh N. Do, and Alexander G. Schwing. 2018. Unsupervised Textual Grounding: Linking Words to Image Concepts. computer vision and pattern recognition (2018)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539151"},{"key":"e_1_3_2_2_27_1","volume-title":"Product1M: Towards Weakly Supervised Instance-Level Product Retrieval via Cross-Modal Pretraining. international conference on computer vision","author":"Zhan Xunlin","year":"2021","unstructured":"Xunlin Zhan, Yangxin Wu, Xiao Dong, Yunchao Wei, Minlong Lu, Yichi Zhang, Hang Xu, and Xiaodan Liang. 2021. Product1M: Towards Weakly Supervised Instance-Level Product Retrieval via Cross-Modal Pretraining. international conference on computer vision (2021)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00978"},{"key":"e_1_3_2_2_29_1","unstructured":"Yanhao Zhang Pan Pan Yun Zheng Kang Zhao Yingya Zhang Xiaofeng Ren and Rong Jin. 2022. Visual Search at Alibaba."},{"key":"e_1_3_2_2_30_1","volume-title":"Weakly Supervised Phrase Localization with Multi-scale Anchored Transformer Network. computer vision and pattern recognition","author":"Zhao Fang","year":"2018","unstructured":"Fang Zhao, Jianshu Li, Jian Zhao, and Jiashi Feng. 2018. Weakly Supervised Phrase Localization with Multi-scale Anchored Transformer Network. computer vision and pattern recognition (2018)."}],"event":{"name":"SIGIR '23: The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Taipei Taiwan","acronym":"SIGIR '23"},"container-title":["Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3591836","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3539618.3591836","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:59Z","timestamp":1750178279000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3591836"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,18]]},"references-count":30,"alternative-id":["10.1145\/3539618.3591836","10.1145\/3539618"],"URL":"https:\/\/doi.org\/10.1145\/3539618.3591836","relation":{},"subject":[],"published":{"date-parts":[[2023,7,18]]},"assertion":[{"value":"2023-07-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}