{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T13:56:13Z","timestamp":1770818173147,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,7,11]],"date-time":"2021-07-11T00:00:00Z","timestamp":1625961600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61976049, 61632007, U20B2063"],"award-info":[{"award-number":["61976049, 61632007, U20B2063"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Sichuan Science and Technology Program","award":["2018GZDZX0032, 2019ZDZX0008, 2019YFG0003 and 2019YFG0533"],"award-info":[{"award-number":["2018GZDZX0032, 2019ZDZX0008, 2019YFG0003 and 2019YFG0533"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["ZYGX2019Z015"],"award-info":[{"award-number":["ZYGX2019Z015"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,7,11]]},"DOI":"10.1145\/3404835.3462965","type":"proceedings-article","created":{"date-parts":[[2021,7,12]],"date-time":"2021-07-12T02:41:54Z","timestamp":1626057714000},"page":"244-254","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":17,"title":["Hybrid Fusion with Intra- and Cross-Modality Attention for Image-Recipe Retrieval"],"prefix":"10.1145","author":[{"given":"Jiao","family":"Li","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xing","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and\u00a0Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fumin","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Electronic Science and\u00a0Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zuo","family":"Cao","sequence":"additional","affiliation":[{"name":"MEITUAN, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kai","family":"Zuo","sequence":"additional","affiliation":[{"name":"MEITUAN, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Heng Tao","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Electronic Science and\u00a0Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,7,11]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"L\u00e9on Bottou. 2010. Large-scale machine learning with stochastic gradient descent. In in COMPSTAT.","DOI":"10.1007\/978-3-7908-2604-3_16"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210036"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964315"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123428"},{"key":"e_1_3_2_2_5_1","volume-title":"Deep Understanding of Cooking Procedure for Cross-modal Recipe Retrieval. In 2018 ACM Multimedia Conference on Multimedia Conference, MM. 1020--1028","author":"Chen Jingjing","year":"2018","unstructured":"Jingjing Chen, Chong-Wah Ngo, Fuli Feng, and Tat-Seng Chua. 2018. Deep Understanding of Cooking Procedure for Cross-modal Recipe Retrieval. In 2018 ACM Multimedia Conference on Multimedia Conference, MM. 1020--1028."},{"key":"e_1_3_2_2_6_1","volume-title":"MultiMedia Modeling - 23rd International Conference, MMM. 588--600.","author":"Chen Jingjing","unstructured":"Jingjing Chen, Lei Pang, and Chong-Wah Ngo. 2017b. Cross-Modal Recipe Retrieval: How to Cook this Dish?. In MultiMedia Modeling - 23rd International Conference, MMM. 588--600."},{"key":"e_1_3_2_2_7_1","volume-title":"KyungHyun Cho, and Yoshua Bengio.","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, cC aglar G\u00fc lcc ehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling. CoRR, Vol. abs\/1412.3555 (2014)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080826"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080826"},{"key":"e_1_3_2_2_11_1","volume-title":"British Machine Vision Conference","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David J. Fleet, Jamie Kiros, and Sanja Fidler. 2018. VSE+: Improving Visual-Semantic Embeddings with Hard Negatives. In British Machine Vision Conference 2018. 12."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654902"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/1719970.1720021"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01458"},{"key":"e_1_3_2_2_15_1","volume-title":"Generative Adversarial Nets. In Annual Conference on Neural Information Processing Systems","author":"Goodfellow Ian J.","year":"2014","unstructured":"Ian J. Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron C. Courville, and Yoshua Bengio. 2014. Generative Adversarial Nets. In Annual Conference on Neural Information Processing Systems 2014. 2672--2680."},{"key":"e_1_3_2_2_16_1","volume-title":"Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR. 770--778","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR. 770--778."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/28.3-4.321"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806240"},{"key":"e_1_3_2_2_19_1","volume-title":"Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR, Yoshua Bengio and Yann LeCun (Eds.).","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR, Yoshua Bengio and Yann LeCun (Eds.)."},{"key":"e_1_3_2_2_20_1","volume-title":"Skip-Thought Vectors. In Annual Conference on Neural Information Processing Systems","author":"Kiros Ryan","year":"2015","unstructured":"Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Skip-Thought Vectors. In Annual Conference on Neural Information Processing Systems 2015. 3294--3302."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2983323.2983897"},{"key":"e_1_3_2_2_22_1","volume-title":"Stacked Cross Attention for Image-Text Matching. In 15th European Conference on Computer Vision (ECCV). 212--228","author":"Lee Kuang-Huei","year":"2018","unstructured":"Kuang-Huei Lee, Xi Chen, Gang Hua, Houdong Hu, and Xiaodong He. 2018a. Stacked Cross Attention for Image-Text Matching. In 15th European Conference on Computer Vision (ECCV). 212--228."},{"key":"e_1_3_2_2_23_1","volume-title":"CleanNet: Transfer Learning for Scalable Image Classifier Training With Label Noise. In 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR. 5447--5456","author":"Lee Kuang-Huei","year":"2018","unstructured":"Kuang-Huei Lee, Xiaodong He, Lei Zhang, and Linjun Yang. 2018c. CleanNet: Transfer Learning for Scalable Image Classifier Training With Label Noise. In 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR. 5447--5456."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_2_25_1","volume-title":"DeepFood: Deep Learning-Based Food Image Recognition for Computer-Aided Dietary Assessment. In 14th International Conference on Smart Homes and Health Telematics, ICOST","author":"Liu Chang","year":"2016","unstructured":"Chang Liu, Yu Cao, Yan Luo, Guanling Chen, Vinod Vokkarane, and Yunsheng Ma. 2016. DeepFood: Deep Learning-Based Food Image Recognition for Computer-Aided Dietary Assessment. In 14th International Conference on Smart Homes and Health Telematics, ICOST 2016. 37--48."},{"key":"e_1_3_2_2_26_1","volume-title":"ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Annual Conference on Neural Information Processing Systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Annual Conference on Neural Information Processing Systems 2019. 13--23."},{"key":"e_1_3_2_2_27_1","volume-title":"Wide-Slice Residual Networks for Food Recognition. In 2018 IEEE Winter Conference on Applications of Computer Vision, WACV. 567--576","author":"Martinel Niki","year":"2018","unstructured":"Niki Martinel, Gian Luca Foresti, and Christian Micheloni. 2018. Wide-Slice Residual Networks for Food Recognition. In 2018 IEEE Winter Conference on Applications of Computer Vision, WACV. 567--576."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2015.70"},{"key":"e_1_3_2_2_29_1","volume-title":"Fine-grained Visual Textual Alignment for Cross-Modal Retrieval using Transformer Encoders. CoRR","author":"Messina Nicola","year":"2020","unstructured":"Nicola Messina, Giuseppe Amato, Andrea Esuli, Fabrizio Falchi, Claudio Gennaro, and St\u00e9 phane Marchand-Maillet. 2020. Fine-grained Visual Textual Alignment for Cross-Modal Retrieval using Transformer Encoders. CoRR, Vol. abs\/2008.05231 (2020). arxiv: 2008.05231"},{"key":"e_1_3_2_2_30_1","volume-title":"Annual Conference on Neural Information Processing Systems","author":"Mikolov Tom\u00e1s","year":"2013","unstructured":"Tom\u00e1s Mikolov, Ilya Sutskever, Kai Chen, Gregory S. Corrado, and Jeffrey Dean. 2013. Distributed Representations of Words and Phrases and their Compositionality. In Annual Conference on Neural Information Processing Systems 2013. 3111--3119."},{"key":"e_1_3_2_2_31_1","first-page":"950","article-title":"You Are What You Eat","volume":"20","author":"Min Weiqing","year":"2018","unstructured":"Weiqing Min, Bing-Kun Bao, Shuhuan Mei, Yaohui Zhu, Yong Rui, and Shuqiang Jiang. 2018. You Are What You Eat: Exploring Rich Recipe Information for Cross-Region Food Analysis. IEEE Trans. Multim., Vol. 20, 4 (2018), 950--964.","journal-title":"Exploring Rich Recipe Information for Cross-Region Food Analysis. IEEE Trans. Multim."},{"key":"e_1_3_2_2_32_1","volume-title":"Jain","author":"Min Weiqing","year":"2019","unstructured":"Weiqing Min, Shuqiang Jiang, Linhu Liu, Yong Rui, and Ramesh C. Jain. 2019. A Survey on Food Computing. ACM Comput. Surv., Vol. 52, 5 (2019), 92:1--92:36."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2639382"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.146"},{"key":"e_1_3_2_2_35_1","volume-title":"Proceedings of the 28th International Conference on Machine Learning, ICML, Lise Getoor and Tobias Scheffer (Eds.). 689--696","author":"Ngiam Jiquan","unstructured":"Jiquan Ngiam, Aditya Khosla, Mingyu Kim, Juhan Nam, Honglak Lee, and Andrew Y. Ng. 2011. Multimodal Deep Learning. In Proceedings of the 28th International Conference on Machine Learning, ICML, Lise Getoor and Tobias Scheffer (Eds.). 689--696."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052663"},{"key":"e_1_3_2_2_37_1","volume-title":"Context-Aware Multi-View Summarization Network for Image-Text Matching. In MM '20: The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA, October 12--16","author":"Qu Leigang","year":"2020","unstructured":"Leigang Qu, Meng Liu, Da Cao, Liqiang Nie, and Qi Tian. 2020. Context-Aware Multi-View Summarization Network for Image-Text Matching. In MM '20: The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA, October 12--16, 2020. 1047--1055."},{"key":"e_1_3_2_2_38_1","volume-title":"Cross-Modal Hierarchical Modelling for Fine-Grained Sketch Based Image Retrieval. In 31st British Machine Vision Conference","author":"Sain Aneeshan","year":"2020","unstructured":"Aneeshan Sain, Ayan Kumar Bhunia, Yongxin Yang, Tao Xiang, and Yi-Zhe Song. 2020. Cross-Modal Hierarchical Modelling for Fine-Grained Sketch Based Image Retrieval. In 31st British Machine Vision Conference 2020, BMVC."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3041021.3055137"},{"key":"e_1_3_2_2_40_1","volume-title":"Learning Cross-Modal Embeddings for Cooking Recipes and Food Images. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR. 3068--3076","author":"Salvador Amaia","year":"2017","unstructured":"Amaia Salvador, Nicholas Hynes, Yusuf Aytar, Javier Mar'i n, Ferda Ofli, Ingmar Weber, and Antonio Torralba. 2017. Learning Cross-Modal Embeddings for Cooking Recipes and Food Images. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR. 3068--3076."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052573"},{"key":"e_1_3_2_2_42_1","volume-title":"Annual Conference on Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Annual Conference on Neural Information Processing Systems 2017. 5998--6008."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_2_44_1","volume-title":"Learning Cross-Modal Embeddings With Adversarial Networks for Cooking Recipes and Food Images. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR. 11572--11581","author":"Wang Hao","unstructured":"Hao Wang, Doyen Sahoo, Chenghao Liu, Ee-Peng Lim, and Steven C. H. Hoi. 2019 b. Learning Cross-Modal Embeddings With Adversarial Networks for Cooking Recipes and Food Images. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR. 11572--11581."},{"key":"e_1_3_2_2_45_1","volume-title":"Hoi","author":"Wang Hao","year":"2020","unstructured":"Hao Wang, Doyen Sahoo, Chenghao Liu, Ke Shu, Palakorn Achananuparp, Ee-Peng Lim, and Steven C. H. Hoi. 2020. Cross-Modal Food Retrieval: Learning a Joint Embedding of Food Images and Recipes with Semantic Consistency and Attention Mechanism. CoRR, Vol. abs\/2003.03955 (2020). arxiv: 2003.03955"},{"key":"e_1_3_2_2_46_1","volume-title":"CAMP: Cross-Modal Adaptive Message Passing for Text-Image Retrieval. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV. 5763--5772","author":"Wang Zihao","year":"2019","unstructured":"Zihao Wang, Xihui Liu, Hongsheng Li, Lu Sheng, Junjie Yan, Xiaogang Wang, and Jing Shao. 2019 a. CAMP: Cross-Modal Adaptive Message Passing for Text-Image Retrieval. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV. 5763--5772."},{"key":"e_1_3_2_2_47_1","volume-title":"Learning Dual Semantic Relations with Graph Attention for Image-Text Matching. CoRR","author":"Wen Keyu","year":"2020","unstructured":"Keyu Wen, Xiaodong Gu, and Qingrong Cheng. 2020. Learning Dual Semantic Relations with Graph Attention for Image-Text Matching. CoRR, Vol. abs\/2010.11550 (2020). arxiv: 2010.11550"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-018-0541-x"},{"key":"e_1_3_2_2_49_1","volume-title":"2020 a. Joint Feature Synthesis and Embedding: Adversarial Cross-modal Retrieval Revisited","author":"Xu X.","year":"2020","unstructured":"X. Xu, K. Lin, Y. Yang, A. Hanjalic, and H. Shen. 2020 a. Joint Feature Synthesis and Embedding: Adversarial Cross-modal Retrieval Revisited. IEEE Transactions on Pattern Analysis & Machine Intelligence (2020), 1--18."},{"key":"e_1_3_2_2_50_1","unstructured":"X. Xu H. Lu J. Song Y. Yang H. T. Shen and X. Li. 2019. Ternary Adversarial Networks With Self-Supervision for Zero-Shot Cross-Modal Retrieval. IEEE Transactions on Cybernetics (2019) 1--14."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2967597"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072614"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3390681"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872427.2882995"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.127"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01174"}],"event":{"name":"SIGIR '21: The 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Virtual Event Canada","acronym":"SIGIR '21","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462965","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3404835.3462965","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:20Z","timestamp":1750191500000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462965"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,11]]},"references-count":56,"alternative-id":["10.1145\/3404835.3462965","10.1145\/3404835"],"URL":"https:\/\/doi.org\/10.1145\/3404835.3462965","relation":{},"subject":[],"published":{"date-parts":[[2021,7,11]]},"assertion":[{"value":"2021-07-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}