{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T15:43:54Z","timestamp":1757778234669,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,7]],"date-time":"2023-06-07T00:00:00Z","timestamp":1686096000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,7]]},"DOI":"10.1145\/3587819.3590989","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T17:24:19Z","timestamp":1686245059000},"page":"257-265","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Multimodal Cascaded Framework with Metric Learning Robust to Missing Modalities for Person Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9553-0906","authenticated-orcid":false,"given":"Vijay","family":"John","sequence":"first","affiliation":[{"name":"RIKEN, Seika-cho, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3799-4550","authenticated-orcid":false,"given":"Yasutomo","family":"Kawanishi","sequence":"additional","affiliation":[{"name":"RIKEN, Seika-cho, Japan"}]}],"member":"320","published-online":{"date-parts":[[2023,6,8]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"SpeakingFaces: A Large-Scale Multimodal Dataset of Voice Commands with Visual and Thermal Video Streams. CoRR abs\/2012.02961","author":"Abdrakhmanova Madina","year":"2020","unstructured":"Madina Abdrakhmanova, Askat Kuzdeuov, Sheikh Jarju, Yerbolat Khassanov, Michael Lewis, and Huseyin Atakan Varol. 2020. SpeakingFaces: A Large-Scale Multimodal Dataset of Voice Commands with Visual and Thermal Video Streams. CoRR abs\/2012.02961 (2020)."},{"key":"e_1_3_2_1_2_1","unstructured":"K. Bayoudh R. Knani F. Hamdaoui and A. Mtibaa. 2021. A survey on deep multimodal learning for computer vision: advances trends applications and datasets. Visual Computing 10 (June 2021) 1--32."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2006.01.017"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219963"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (June","author":"Chen C.","year":"2019","unstructured":"C. Chen, S. Rosa, Y. Miao, C.X. Lu, W. Wu, A. Markham, and N. Trigoni. 2019. Selective sensor fusion for neural visual-inertial odometry. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (June 2019), 10542--10551."},{"key":"e_1_3_2_1_6_1","volume-title":"Audio-Visual Multimodal Fusion for Bio-metric Person Authentication and Liveness Verification. in Proceedings of the 2005 NICTA-HCSNet Multimodal User Interaction Workshop -","volume":"57","author":"Chetty Girija","year":"2006","unstructured":"Girija Chetty and Michael Wagner. 2006. Audio-Visual Multimodal Fusion for Bio-metric Person Authentication and Liveness Verification. in Proceedings of the 2005 NICTA-HCSNet Multimodal User Interaction Workshop - Volume 57 (September 2006), 17--24."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the International Conference on Audio- and Video-Based Biometric Person Authentication (June","author":"Choudhury Tanzeem","year":"1998","unstructured":"Tanzeem Choudhury, Brian Clarkson, Tony Jebara, and Alex Pentland. 1998. Multimodal Person Recognition using Unconstrained Audio and Video. in Proceedings of the International Conference on Audio- and Video-Based Biometric Person Authentication (June 1998), 176--181."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of APSIPA, Annual Summit and Conference (December","author":"Das R. K.","year":"2020","unstructured":"R. K. Das, R. Tao, J. Yang, W. Rao, C. Yu, and H. Li. 2020. HLT-NUS submission for 2019 NIST multimedia speaker recognition evaluation. in Proceedings of APSIPA, Annual Summit and Conference (December 2020), 605--609."},{"key":"e_1_3_2_1_9_1","volume-title":"Language Modeling with Gated Convolutional Networks. (August","author":"Dauphin Yann N.","year":"2017","unstructured":"Yann N. Dauphin, Angela Fan, Michael Auli, and David Grangier. 2017. Language Modeling with Gated Convolutional Networks. (August 2017), 933--941."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Conference on Learning Representations (May","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. in Proceedings of the International Conference on Learning Representations (May 2021), 1--10."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682773"},{"key":"e_1_3_2_1_12_1","volume-title":"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications. CoRR abs\/1704.04861","author":"Howard Andrew G.","year":"2017","unstructured":"Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, and Hartwig Adam. 2017. MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications. CoRR abs\/1704.04861 (2017). arXiv:1704.04861 http:\/\/arxiv.org\/abs\/1704.04861"},{"key":"e_1_3_2_1_13_1","volume-title":"A Multimodal Sensor Fusion Framework Robust to Missing Modalities for Person Recognition. arXiv preprint arXiv:2210.10972, to Appear in ACM Multimedia Asia","author":"John Vijay","year":"2022","unstructured":"Vijay John and Yasutomo Kawanishi. 2022. A Multimodal Sensor Fusion Framework Robust to Missing Modalities for Person Recognition. arXiv preprint arXiv:2210.10972, to Appear in ACM Multimedia Asia, 2022 (October 2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-08628-9"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-006-6655-0"},{"key":"e_1_3_2_1_16_1","volume-title":"Video Face Recognition with Audio-Visual Aggregation Network. in Proceedings of the International Conference on Neural Information Processing) (December","author":"Li Qinbo","year":"2021","unstructured":"Qinbo Li, Qing Wan, Sang-Heon Lee, and Yoonsuck Choe. 2021. Video Face Recognition with Audio-Visual Aggregation Network. in Proceedings of the International Conference on Neural Information Processing) (December 2021), 150--161."},{"key":"e_1_3_2_1_17_1","volume-title":"Russo","author":"Livingstone Steven R.","year":"2018","unstructured":"Steven R. Livingstone and Frank A. Russo. 2018. The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English. PLOS ONE 13, 5 (5 2018), 1--35."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01764"},{"key":"e_1_3_2_1_19_1","volume-title":"SMIL: Multimodal Learning with Severely Missing Modality. CoRR abs\/2103.05677","author":"Ma Mengmeng","year":"2021","unstructured":"Mengmeng Ma, Jian Ren, Long Zhao, Sergey Tulyakov, Cathy Wu, and Xi Peng. 2021. SMIL: Multimodal Learning with Severely Missing Modality. CoRR abs\/2103.05677 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the Digital Image Computing: Techniques and Applications (December","author":"Nawaz S.","year":"2019","unstructured":"S. Nawaz, M. K. Janjua, I. Gallo, A. Mahmood, and A. Calefati. 2019. Deep latent space learning for cross-modal mapping of audio and visual signals. in Proceedings of the Digital Image Computing: Techniques and Applications (December 2019), 1--7."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3395035.3425202"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016892"},{"key":"e_1_3_2_1_23_1","volume-title":"Emotion Recognition System from Speech and Visual Information based on Convolutional Neural Networks. arXiv abs\/2003.00351","author":"Ristea Nicolae-Catalin","year":"2020","unstructured":"Nicolae-Catalin Ristea, Liviu-Cristian Dutu, and Anamaria Radoi. 2020. Emotion Recognition System from Speech and Visual Information based on Convolutional Neural Networks. arXiv abs\/2003.00351 (2020)."},{"key":"e_1_3_2_1_24_1","volume-title":"The 2019 NIST Audio-Visual Speaker Recognition Evaluation. in Proceedings of the Speaker and Language Recognition Workshop: Odyssey 2020 (November","author":"Sadjadi Seyed","year":"2020","unstructured":"Seyed Sadjadi, Craig Greenberg, Elliot Singer, Douglas Olson, Lisa Mason, and Jaime Hernandez-Cordero. 2020. The 2019 NIST Audio-Visual Speaker Recognition Evaluation. in Proceedings of the Speaker and Language Recognition Workshop: Odyssey 2020 (November 2020), 266--272."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0218001417560055"},{"key":"e_1_3_2_1_26_1","volume-title":"Audio-Visual Person Recognition in Multimedia Data From the Iarpa Janus Program. in Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (April","author":"Sell Gregory","year":"2018","unstructured":"Gregory Sell, Kevin Duh, David Snyder, Dave Etter, and Daniel Garcia-Romero. 2018. Audio-Visual Person Recognition in Multimedia Data From the Iarpa Janus Program. in Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (April 2018), 3031--3035."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of SPIE 5405","author":"Singh Saurabh","year":"2004","unstructured":"Saurabh Singh, Aglika Gyaourova, George Bebis, and Ioannis Pavlidis. 2004. Infrared and visible image fusion for face recognition. in Proceedings of SPIE 5405, Biometric Technology for Human Identification 5404 (August 2004), 585 -- 596."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1814"},{"key":"e_1_3_2_1_29_1","first-page":"1","article-title":"Audio-Visual Person Recognition Using Deep Convolutional Neural Networks","volume":"8","author":"Vegad Sagar","year":"2017","unstructured":"Sagar Vegad, Harshita Pathak Rajendra Patel, Hanqi Zhuang, and Mehul R. Naik. 2017. Audio-Visual Person Recognition Using Deep Convolutional Neural Networks. Journal of biometrics & biostatistics 8 (2017), 1--7.","journal-title":"Journal of biometrics & biostatistics"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380000"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the International Conference on Learning Representations (May","author":"Wen Y.","year":"2019","unstructured":"Y. Wen, M. A. Ismail, W. Liu, B. Raj, , and R. Singh. 2019. Disjoint mapping network for cross-modal matching of voices and faces. in Proceedings of the International Conference on Learning Representations (May 2019), 1--17."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413538"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.203"}],"event":{"name":"MMSys '23: 14th Conference on ACM Multimedia Systems","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGCOMM ACM Special Interest Group on Data Communication","SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing"],"location":"Vancouver BC Canada","acronym":"MMSys '23"},"container-title":["Proceedings of the 14th ACM Multimedia Systems Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3587819.3590989","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3587819.3590989","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:08:01Z","timestamp":1750183681000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3587819.3590989"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,7]]},"references-count":33,"alternative-id":["10.1145\/3587819.3590989","10.1145\/3587819"],"URL":"https:\/\/doi.org\/10.1145\/3587819.3590989","relation":{},"subject":[],"published":{"date-parts":[[2023,6,7]]},"assertion":[{"value":"2023-06-08","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}