{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T07:56:33Z","timestamp":1772265393618,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,6,27]],"date-time":"2018-06-27T00:00:00Z","timestamp":1530057600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100003593","name":"Conselho Nacional de Desenvolvimento Cient\u00edfico e Tecnol\u00f3gico","doi-asserted-by":"publisher","award":["200869\/2015-9"],"award-info":[{"award-number":["200869\/2015-9"]}],"id":[{"id":"10.13039\/501100003593","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,6,27]]},"DOI":"10.1145\/3209978.3210036","type":"proceedings-article","created":{"date-parts":[[2018,7,2]],"date-time":"2018-07-02T12:12:40Z","timestamp":1530533560000},"page":"35-44","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":139,"title":["Cross-Modal Retrieval in the Cooking Context"],"prefix":"10.1145","author":[{"given":"Micael","family":"Carvalho","sequence":"first","affiliation":[{"name":"Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France"}]},{"given":"R\u00e9mi","family":"Cad\u00e8ne","sequence":"additional","affiliation":[{"name":"Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France"}]},{"given":"David","family":"Picard","sequence":"additional","affiliation":[{"name":"Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France"}]},{"given":"Laure","family":"Soulier","sequence":"additional","affiliation":[{"name":"Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France"}]},{"given":"Nicolas","family":"Thome","sequence":"additional","affiliation":[{"name":"Conservatoire National des Arts et M\u00e9tiers, Paris, France"}]},{"given":"Matthieu","family":"Cord","sequence":"additional","affiliation":[{"name":"Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France"}]}],"member":"320","published-online":{"date-parts":[[2018,6,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"CEA2017: Proceedings of the 9th Workshop on Multimedia for Cooking and Eating Activities in Conjunction with The 2017 International Joint Conference on Artificial Intelligence.","unstructured":"2017. CEA2017: Proceedings of the 9th Workshop on Multimedia for Cooking and Eating Activities in Conjunction with The 2017 International Joint Conference on Artificial Intelligence. 2017. CEA2017: Proceedings of the 9th Workshop on Multimedia for Cooking and Eating Activities in Conjunction with The 2017 International Joint Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3084142"},{"key":"e_1_3_2_1_3_1","unstructured":"Galen Andrew Raman Arora Jeff Bilmes and Karen Livescu. 2013. Deep canonical correlation analysis. In ICML. 1247--1255.   Galen Andrew Raman Arora Jeff Bilmes and Karen Livescu. 2013. Deep canonical correlation analysis. In ICML. 1247--1255."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1162\/153244303768966085"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2015.117"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Lukas Bossard Matthieu Guillaumin and Luc Van Gool. 2014. Food-101 -- Mining Discriminative Components with Random Forests. In ECCV.  Lukas Bossard Matthieu Guillaumin and Luc Van Gool. 2014. Food-101 -- Mining Discriminative Components with Random Forests. In ECCV.","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964315"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Jingjing Chen Lei Pang and Chong-Wah Ngo. 2017. Cross-Modal Recipe Retrieval: How to Cook this Dish?. In MultiMedia Modeling. 588--600.  Jingjing Chen Lei Pang and Chong-Wah Ngo. 2017. Cross-Modal Recipe Retrieval: How to Cook this Dish?. In MultiMedia Modeling. 588--600.","DOI":"10.1007\/978-3-319-51811-4_48"},{"key":"e_1_3_2_1_9_1","volume-title":"PFID: Pittsburgh fast-food image dataset. In ICIP. 289--292.","author":"Chen M.","year":"2009","unstructured":"M. Chen , K. Dhingra , W. Wu , L. Yang , R. Sukthankar , and J. Yang . 2009 . PFID: Pittsburgh fast-food image dataset. In ICIP. 289--292. M. Chen, K. Dhingra, W. Wu, L. Yang, R. Sukthankar, and J. Yang. 2009. PFID: Pittsburgh fast-food image dataset. In ICIP. 289--292."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080826"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Giovanni Maria Farinella Dario Allegra and Filippo Stanco. 2015. A Benchmark Dataset to Study the Representation of Food Images. 584--599.  Giovanni Maria Farinella Dario Allegra and Filippo Stanco. 2015. A Benchmark Dataset to Study the Representation of Food Images. 584--599.","DOI":"10.1007\/978-3-319-16199-0_41"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080686"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1080\/00437956.1954.11659520"},{"key":"e_1_3_2_1_15_1","volume-title":"Deep Residual Learning for Image Recognition. arXiv arXiv:1512.03385","author":"He Kaiming","year":"2015","unstructured":"Kaiming He , Xiangyu Zhang , Shaoqing Ren , and Jian Sun . 2015. Deep Residual Learning for Image Recognition. arXiv arXiv:1512.03385 ( 2015 ). Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv arXiv:1512.03385 (2015)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/28.3-4.321"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.242"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/860435.860459"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy and Li Fei-Fei. 2015. Deep visual-semantic alignments for generating image descriptions. In CVPR. 3128--3137.  Andrej Karpathy and Li Fei-Fei. 2015. Deep visual-semantic alignments for generating image descriptions. In CVPR. 3128--3137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2638728.2641339"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-04117-9_38"},{"key":"e_1_3_2_1_23_1","volume-title":"Adam: A method for stochastic optimization. arXiv arXiv:1412.6980","author":"Kingma Diederik","year":"2014","unstructured":"Diederik Kingma and Jimmy Ba . 2014 . Adam: A method for stochastic optimization. arXiv arXiv:1412.6980 (2014). Diederik Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_24_1","volume-title":"Unifying visualsemantic embeddings with multimodal neural language models. TACL","author":"Kiros Ryan","year":"2015","unstructured":"Ryan Kiros , Ruslan Salakhutdinov , and Richard S Zemel . 2015. Unifying visualsemantic embeddings with multimodal neural language models. TACL ( 2015 ). Ryan Kiros, Ruslan Salakhutdinov, and Richard S Zemel. 2015. Unifying visualsemantic embeddings with multimodal neural language models. TACL (2015)."},{"key":"e_1_3_2_1_25_1","unstructured":"Ryan Kiros Yukun Zhu Ruslan R Salakhutdinov Richard Zemel Raquel Urtasun Antonio Torralba and Sanja Fidler. 2015. Skip-Thought Vectors. In NIPS. 3294-- 3302.   Ryan Kiros Yukun Zhu Ruslan R Salakhutdinov Richard Zemel Raquel Urtasun Antonio Torralba and Sanja Fidler. 2015. Skip-Thought Vectors. In NIPS. 3294-- 3302."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2983323.2983897"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2914586.2914632"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1142\/S012906570000034X"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.38"},{"key":"e_1_3_2_1_30_1","volume-title":"Nghia The Pham, and Marco Baroni","author":"Lazaridou Angeliki","year":"2015","unstructured":"Angeliki Lazaridou , Nghia The Pham, and Marco Baroni . 2015 . Combining Language and Vision with a Multimodal Skip-gram Model. In NAACL HLT. 153-- 163. Angeliki Lazaridou, Nghia The Pham, and Marco Baroni. 2015. Combining Language and Vision with a Multimodal Skip-gram Model. In NAACL HLT. 153-- 163."},{"key":"e_1_3_2_1_31_1","unstructured":"Tomas Mikolov Ilya Sutskever Kai Chen Greg S Corrado and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In NIPS. 3111--3119.   Tomas Mikolov Ilya Sutskever Kai Chen Greg S Corrado and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In NIPS. 3111--3119."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Amaia Salvador Nicholas Hynes Yusuf Aytar Javier Marin Ferda Ofli Ingmar Weber and Antonio Torralba. 2017. Learning Cross-modal Embeddings for Cooking Recipes and Food Images. In CVPR.  Amaia Salvador Nicholas Hynes Yusuf Aytar Javier Marin Ferda Ofli Ingmar Weber and Antonio Torralba. 2017. Learning Cross-modal Embeddings for Cooking Recipes and Food Images. In CVPR.","DOI":"10.1109\/CVPR.2017.327"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132847.3133137"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1002\/asi.21659"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052573"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Xin Wang D. Kumar N. Thome M. Cord and F. Precioso. 2015. Recipe recognition with large multimodal food dataset. In ICMEW. 1--6.  Xin Wang D. Kumar N. Thome M. Cord and F. Precioso. 2015. Recipe recognition with large multimodal food dataset. In ICMEW. 1--6.","DOI":"10.1109\/ICMEW.2015.7169757"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.5555\/1577069.1577078"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080678"},{"key":"e_1_3_2_1_41_1","volume-title":"Ng","author":"Xing Eric P.","year":"2003","unstructured":"Eric P. Xing , Michael I. Jordan , Stuart J Russell , and Andrew Y . Ng . 2003 . Distance Metric Learning with Application to Clustering with Side-Information. In NIPS. 521--528. Eric P. Xing, Michael I. Jordan, Stuart J Russell, and Andrew Y. Ng. 2003. Distance Metric Learning with Application to Clustering with Side-Information. In NIPS. 521--528."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Fei Yan and Krystian Mikolajczyk. 2015. Deep correlation for matching images and text. In CVPR. 3441--3450.  Fei Yan and Krystian Mikolajczyk. 2015. Deep correlation for matching images and text. In CVPR. 3441--3450.","DOI":"10.1109\/CVPR.2015.7298966"}],"event":{"name":"SIGIR '18: The 41st International ACM SIGIR conference on research and development in Information Retrieval","location":"Ann Arbor MI USA","acronym":"SIGIR '18","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["The 41st International ACM SIGIR Conference on Research &amp; Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3209978.3210036","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3209978.3210036","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T02:07:49Z","timestamp":1750212469000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3209978.3210036"}},"subtitle":["Learning Semantic Text-Image Embeddings"],"short-title":[],"issued":{"date-parts":[[2018,6,27]]},"references-count":42,"alternative-id":["10.1145\/3209978.3210036","10.1145\/3209978"],"URL":"https:\/\/doi.org\/10.1145\/3209978.3210036","relation":{},"subject":[],"published":{"date-parts":[[2018,6,27]]},"assertion":[{"value":"2018-06-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}