{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:03:00Z","timestamp":1776888180550,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658067","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"211-219","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Anchor-aware Deep Metric Learning for Audio-visual Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6425-6270","authenticated-orcid":false,"given":"Donghuo","family":"Zeng","sequence":"first","affiliation":[{"name":"KDDI Research, Inc., Saitama, JP"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6562-0487","authenticated-orcid":false,"given":"Yanan","family":"Wang","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Saitama, JP"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9563-760X","authenticated-orcid":false,"given":"Kazushi","family":"Ikeda","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Saitama, JP"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0294-6620","authenticated-orcid":false,"given":"Yi","family":"Yu","sequence":"additional","affiliation":[{"name":"Hiroshima University, Hiroshima, JP"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 30th International Conference on Machine Learning 
(Proceedings of Machine Learning Research","volume":"28","author":"Andrew Galen","year":"2013","unstructured":"Galen Andrew, Raman Arora, Jeff Bilmes, and Karen Livescu. 2013. Deep Canonical Correlation Analysis. In Proceedings of the 30th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 28). PMLR, Atlanta, Georgia, USA, pp.1247--1255."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.290"},{"key":"e_1_3_2_1_3_1","volume-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1162\/0899766042321814"},{"key":"e_1_3_2_1_5_1","volume-title":"The mahalanobis distance. Chemometrics and intelligent laboratory systems 50, 1","author":"Maesschalck Roy De","year":"2000","unstructured":"Roy De Maesschalck, Delphine Jouan-Rimbaud, and D\u00e9sir\u00e9 L Massart. 2000. The mahalanobis distance. Chemometrics and intelligent laboratory systems 50, 1 (2000), 1--18."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00294"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00726"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3323873.3325045"},{"key":"e_1_3_2_1_9_1","volume-title":"Dimensionality reduction by learning an invariant mapping. In 2006 IEEE computer society conference on computer vision and pattern recognition (CVPR'06)","author":"Hadsell Raia","unstructured":"Raia Hadsell, Sumit Chopra, and Yann LeCun. 2006. Dimensionality reduction by learning an invariant mapping. 
In 2006 IEEE computer society conference on computer vision and pattern recognition (CVPR'06), Vol. 2. IEEE, 1735--1742."},{"key":"e_1_3_2_1_10_1","unstructured":"Ning Han Jingjing Chen Guangyi Xiao Yawen Zeng Chuhao Shi and Hao Chen. 2021. Visual Spatio-temporal Relation-enhanced Network for Cross-modal Text-Video Retrieval. https:\/\/arxiv.org\/abs\/2110.15609"},{"key":"e_1_3_2_1_11_1","unstructured":"Alexander Hermans Lucas Beyer and Bastian Leibe. 2017. In defense of the triplet loss for person re-identification. arXiv preprint arXiv:1703.07737 (2017)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24261-3_7"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.3390\/sym11091066"},{"key":"e_1_3_2_1_14_1","volume-title":"Supervised contrastive learning. Advances in neural information processing systems 33","author":"Khosla Prannay","year":"2020","unstructured":"Prannay Khosla, Piotr Teterwak, Chen Wang, Aaron Sarna, Yonglong Tian, Phillip Isola, Aaron Maschinot, Ce Liu, and Dilip Krishnan. 2020. Supervised contrastive learning. Advances in neural information processing systems 33 (2020), 18661--18673."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01906"},{"key":"e_1_3_2_1_16_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00728"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Brian Kulis et al. 2013. Metric learning: A survey. 
Foundations and Trends\u00ae in Machine Learning 5 4 (2013) 287--364.","DOI":"10.1561\/2200000019"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1142\/S012906570000034X"},{"key":"e_1_3_2_1_20_1","volume-title":"2008 IEEE Conference on Computer Vision and Pattern Recognition. IEEE, 1--8.","author":"Lee Jung-Eun","year":"2008","unstructured":"Jung-Eun Lee, Rong Jin, and Anil K Jain. 2008. Rank-based distance metric learning: An application to image retrieval. In 2008 IEEE Conference on Computer Vision and Pattern Recognition. IEEE, 1--8."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20025"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_23"},{"key":"e_1_3_2_1_23_1","volume-title":"Similarity metric learning for a variable-kernel classifier. Neural computation 7, 1","author":"Lowe David G","year":"1995","unstructured":"David G Lowe. 1995. Similarity metric learning for a variable-kernel classifier. Neural computation 7, 1 (1995), 72--85."},{"key":"e_1_3_2_1_24_1","volume-title":"Asian Conference on Computer Vision. 14--pages.","author":"Mignon Alexis","year":"2012","unstructured":"Alexis Mignon and Fr\u00e9d\u00e9ric Jurie. 2012. CMML: A new metric learning approach for cross modal matching. In Asian Conference on Computer Vision. 14--pages."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139). 
PMLR, Virtual Event, pp.8748--8763."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1201\/b18358-8"},{"key":"e_1_3_2_1_27_1","volume-title":"Improved deep metric learning with multi-class n-pair loss objective. Advances in neural information processing systems 29","author":"Sohn Kihyuk","year":"2016","unstructured":"Kihyuk Sohn. 2016. Improved deep metric learning with multi-class n-pair loss objective. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the european conference on computer vision (eccv) workshops. 0--0.","author":"Sur\u00eds Didac","year":"2018","unstructured":"Didac Sur\u00eds, Amanda Duarte, Amaia Salvador, Jordi Torres, and Xavier Gir\u00f3-i Nieto. 2018. Cross-modal embeddings for video and audio retrieval. In Proceedings of the european conference on computer vision (eccv) workshops. 0--0."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_1_30_1","volume-title":"It takes two to tango: Mixup for deep metric learning. arXiv preprint arXiv:2106.04990","author":"Venkataramanan Shashanka","year":"2021","unstructured":"Shashanka Venkataramanan, Bill Psomas, Ewa Kijak, Laurent Amsaleg, Konstantinos Karantzalos, and Yannis Avrithis. 2021. It takes two to tango: Mixup for deep metric learning. arXiv preprint arXiv:2106.04990 (2021)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3336191.3371850"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_1_33_1","volume-title":"VideoAdviser: Video Knowledge Distillation for Multimodal Transfer Learning","author":"Wang Yanan","year":"2023","unstructured":"Yanan Wang, Donghuo Zeng, Shinya Wada, and Satoshi Kurihara. 2023. VideoAdviser: Video Knowledge Distillation for Multimodal Transfer Learning. 
IEEE Access (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Distance metric learning for large margin nearest neighbor classification. Advances in neural information processing systems 18","author":"Weinberger Kilian Q","year":"2005","unstructured":"Kilian Q Weinberger, John Blitzer, and Lawrence Saul. 2005. Distance metric learning for large margin nearest neighbor classification. Advances in neural information processing systems 18 (2005)."},{"key":"e_1_3_2_1_35_1","volume-title":"Distance metric learning with application to clustering with side-information. Advances in neural information processing systems 15","author":"Xing Eric","year":"2002","unstructured":"Eric Xing, Michael Jordan, Stuart J Russell, and Andrew Ng. 2002. Distance metric learning with application to clustering with side-information. Advances in neural information processing systems 15 (2002)."},{"key":"e_1_3_2_1_36_1","first-page":"1250","volume-title":"Category-based deep CCA for fine-grained venue discovery from multimodal data","author":"Yu Yi","year":"2019","unstructured":"Yi Yu, Suhua Tang, Kiyoharu Aizawa, and Akiko Aizawa. 2019. Category-based deep CCA for fine-grained venue discovery from multimodal data. IEEE transactions on neural networks and learning systems Vol.30, no.4 (2019), pp.1250--1258."},{"key":"e_1_3_2_1_37_1","volume-title":"Two-Stage Triplet Loss Training with Curriculum Augmentation for Audio-Visual Retrieval. arXiv preprint arXiv:2310.13451","author":"Zeng Donghuo","year":"2023","unstructured":"Donghuo Zeng and Kazushi Ikeda. 2023. Two-Stage Triplet Loss Training with Curriculum Augmentation for Audio-Visual Retrieval. arXiv preprint arXiv:2310.13451 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Complete Cross-triplet Loss in Label Space for Audio-visual Cross-modal Retrieval. In 2022 IEEE International Symposium on Multimedia (ISM). IEEE, 1--9.","author":"Zeng Donghuo","year":"2022","unstructured":"Donghuo Zeng, Yanan Wang, Jianming Wu, and Kazushi Ikeda. 2022. 
Complete Cross-triplet Loss in Label Space for Audio-visual Cross-modal Retrieval. In 2022 IEEE International Symposium on Multimedia (ISM). IEEE, 1--9."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3564608"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.2018.00-21"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387164"},{"key":"e_1_3_2_1_42_1","volume-title":"Multi-scale network with shared cross-attention for audio--visual correlation learning. Neural Computing and Applications","author":"Zhang Jiwei","year":"2023","unstructured":"Jiwei Zhang, Yi Yu, Suhua Tang, Wei Li, and Jianming Wu. 2023. Multi-scale network with shared cross-attention for audio--visual correlation learning. Neural Computing and Applications (2023), 1--15."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575658"},{"key":"e_1_3_2_1_44_1","first-page":"10394","volume-title":"Deep Supervised Cross-Modal Retrieval. In IEEE Conference on Computer Vision and Pattern Recognition. IEEE","author":"Zhen Liangli","year":"2019","unstructured":"Liangli Zhen, Peng Hu, Xu Wang, and Dezhong Peng. 2019. Deep Supervised Cross-Modal Retrieval. In IEEE Conference on Computer Vision and Pattern Recognition. IEEE, Long Beach, CA, USA, pp.10394--10403. https:\/\/doi.org\/10.1109\/CVPR.2019.01064"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00016"},{"key":"e_1_3_2_1_46_1","first-page":"1","volume-title":"Dual-path convolutional image-text embeddings with instance loss. ACM Transactions on Multimedia Computing, Communications, and Applications","author":"Zheng Zhedong","year":"2020","unstructured":"Zhedong Zheng, Liang Zheng, Michael Garrett, Yi Yang, Mingliang Xu, and Yi-Dong Shen. 2020. Dual-path convolutional image-text embeddings with instance loss. 
ACM Transactions on Multimedia Computing, Communications, and Applications Vol.16, no.2 (2020), pp.1--23."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00374"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00148"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658067","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658067","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:51:29Z","timestamp":1755766289000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658067"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":48,"alternative-id":["10.1145\/3652583.3658067","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658067","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}