{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T03:43:52Z","timestamp":1779335032137,"version":"3.51.4"},"reference-count":45,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"10","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2019,10,1]]},"DOI":"10.1587\/transinf.2018edp7383","type":"journal-article","created":{"date-parts":[[2019,10,1]],"date-time":"2019-10-01T02:43:21Z","timestamp":1569897801000},"page":"2033-2042","source":"Crossref","is-referenced-by-count":17,"title":["Cross-Domain Deep Feature Combination for Bird Species Classification with Audio-Visual Data"],"prefix":"10.1587","volume":"E102.D","author":[{"given":"Naranchimeg","family":"BOLD","sequence":"first","affiliation":[{"name":"Graduate School of Engineering, Department of Design and Media Technology, Iwate University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"ZHANG","sequence":"additional","affiliation":[{"name":"Graduate School of Engineering, Information Science, University of Fukui"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Takuya","family":"AKASHI","sequence":"additional","affiliation":[{"name":"Graduate School of Engineering, Department of Design and Media Technology, Iwate University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] N. Zhang, J. Donahue, R. Girshick, and T. Darrell, \u201cPart-based r-cnns for fine-grained category detection,\u201d European conference on computer vision, vol.8689, pp.834-849, Springer, 2014. 10.1007\/978-3-319-10590-1_54","DOI":"10.1007\/978-3-319-10590-1_54"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] S. Branson, G. Van Horn, S. Belongie, and P. Perona, \u201cBird species categorization using pose normalized deep convolutional nets,\u201d arXiv preprint arXiv:1406.2952, 2014.","DOI":"10.5244\/C.28.87"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] Y. Cui, F. Zhou, Y. Lin, and S. Belongie, \u201cFine-grained categorization and dataset bootstrapping using deep metric learning with humans in the loop,\u201d Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp.1153-1162, 2016. 10.1109\/cvpr.2016.130","DOI":"10.1109\/CVPR.2016.130"},{"key":"4","doi-asserted-by":"publisher","unstructured":"[4] E. Gavves, B. Fernando, C.G.M. Snoek, A.W. Smeulders, and T. Tuytelaars, \u201cLocal alignments for fine-grained categorization,\u201d International Journal of Computer Vision, vol.111, no.2, pp.191-212, 2015. 10.1007\/s11263-014-0741-5","DOI":"10.1007\/s11263-014-0741-5"},{"key":"5","unstructured":"[5] P. Guo and R. Farrell, \u201cFine-grained visual categorization using pairs: Pose and appearance integration for recognizing subcategories,\u201d arXiv preprint arXiv:1801.09057, 2018."},{"key":"6","unstructured":"[6] V. Lebedev, A. Babenko, and V. Lempitsky, \u201cImpostor networks for fast fine-grained recognition,\u201d arXiv preprint arXiv:1806.05217, 2018."},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] X. He and Y. Peng, \u201cFine-grained image classification via combining vision and language,\u201d Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.5994-6002, 2017. 10.1109\/cvpr.2017.775","DOI":"10.1109\/CVPR.2017.775"},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] A. Owens, J. Wu, J.H. McDermott, W.T. Freeman, and A. Torralba, \u201cAmbient sound provides supervision for visual learning,\u201d European Conference on Computer Vision, vol.9905, pp.801-816, Springer, 2016. 10.1007\/978-3-319-46448-0_48","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"9","unstructured":"[9] S. Kahl, T. Wilhelm-Stein, H. Hussein, H. Klinck, D. Kowerko, M. Ritter, and M. Eibl, \u201cLarge-scale bird sound classification using convolutional neural networks,\u201d Working notes of CLEF, 2017."},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] E. Cakir, S. Adavanne, G. Parascandolo, K. Drossos, and T. Virtanen, \u201cConvolutional recurrent neural networks for bird audio detection,\u201d Signal Processing Conference (EUSIPCO), 2017 25th European, pp.1744-1748, IEEE, 2017. 10.23919\/eusipco.2017.8081508","DOI":"10.23919\/EUSIPCO.2017.8081508"},{"key":"11","doi-asserted-by":"publisher","unstructured":"[11] N. Takahashi, M. Gygli, and L. Van Gool, \u201cAenet: Learning deep audio features for video analysis,\u201d IEEE Trans. Multimedia, vol.20, no.3, pp.513-524, 2018. 10.1109\/tmm.2017.2751969","DOI":"10.1109\/TMM.2017.2751969"},{"key":"12","unstructured":"[12] K.J. Piczak, \u201cRecognizing bird species in audio recordings using deep convolutional neural networks.,\u201d CLEF (Working Notes), pp.534-543, 2016."},{"key":"13","unstructured":"[13] N. Srivastava and R.R. Salakhutdinov, \u201cMultimodal learning with deep boltzmann machines,\u201d Advances in neural information processing systems, pp.2222-2230, 2012."},{"key":"14","unstructured":"[14] J. Ngiam, A. Khosla, M. Kim, J. Nam, H. Lee, and A.Y. Ng, \u201cMultimodal deep learning,\u201d Proceedings of the 28th international conference on machine learning (ICML-11), pp.689-696, 2011."},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] E. Tatulli and T. Hueber, \u201cFeature extraction using multimodal convolutional neural networks for visual speech recognition,\u201d Acoustics, Speech and Signal Processing (ICASSP), 2017 IEEE International Conference on, pp.2971-2975, IEEE, 2017. 10.1109\/icassp.2017.7952701","DOI":"10.1109\/ICASSP.2017.7952701"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] A. Eitel, J.T. Springenberg, L. Spinello, M. Riedmiller, and W. Burgard, \u201cMultimodal deep learning for robust rgb-d object recognition,\u201d Intelligent Robots and Systems (IROS), 2015 IEEE\/RSJ International Conference on, pp.681-687, IEEE, 2015. 10.1109\/iros.2015.7353446","DOI":"10.1109\/IROS.2015.7353446"},{"key":"17","unstructured":"[17] K. Simonyan and A. Zisserman, \u201cTwo-stream convolutional networks for action recognition in videos,\u201d Advances in neural information processing systems, pp.568-576, 2014."},{"key":"18","doi-asserted-by":"publisher","unstructured":"[18] K. Noda, Y. Yamaguchi, K. Nakadai, H.G. Okuno, and T. Ogata, \u201cAudio-visual speech recognition using deep learning,\u201d Applied Intelligence, vol.42, no.4, pp.722-737, 2015. 10.1007\/s10489-014-0629-7","DOI":"10.1007\/s10489-014-0629-7"},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] H. Meutzner, N. Ma, R. Nickel, C. Schymura, and D. Kolossa, \u201cImproving audio-visual speech recognition using deep neural networks with dynamic stream reliability estimates,\u201d Acoustics, Speech and Signal Processing (ICASSP), 2017 IEEE International Conference on, pp.5320-5324, IEEE, 2017. 10.1109\/icassp.2017.7953172","DOI":"10.1109\/ICASSP.2017.7953172"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] A. Torfi, S.M. Iranmanesh, N.M. Nasrabadi, and J. Dawson, \u201cCoupled 3d convolutional neural networks for audio-visual recognition,\u201d arXiv preprint, 2017.","DOI":"10.1109\/ACCESS.2017.2761539"},{"key":"21","doi-asserted-by":"crossref","unstructured":"[21] J. Huang and B. Kingsbury, \u201cAudio-visual deep learning for noise robust speech recognition,\u201d Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on, pp.7596-7599, IEEE, 2013. 10.1109\/icassp.2013.6639140","DOI":"10.1109\/ICASSP.2013.6639140"},{"key":"22","unstructured":"[22] C. Wah, S. Branson, P. Welinder, P. Perona, and S. Belongie, \u201cThe caltech-ucsd birds-200-2011 dataset,\u201d Tech. Rep. CNS-TR-2011-001, California Institute of Technology, 2011."},{"key":"23","doi-asserted-by":"publisher","unstructured":"[23] T. Baltru\u0161aitis, C. Ahuja, and L.-P. Morency, \u201cMultimodal machine learning: A survey and taxonomy,\u201d IEEE Trans. Pattern Anal. Mach. Intell., vol.41, no.2, pp.423-443, 2019. 10.1109\/tpami.2018.2798607","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"24","doi-asserted-by":"publisher","unstructured":"[24] P.K. Atrey, M.A. Hossain, A. El Saddik, and M.S. Kankanhalli, \u201cMultimodal fusion for multimedia analysis: a survey,\u201d Multimedia systems, vol.16, no.6, pp.345-379, 2010. 10.1007\/s00530-010-0182-0","DOI":"10.1007\/s00530-010-0182-0"},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] V. Ranjan, N. Rasiwasia, and C.V. Jawahar, \u201cMulti-label cross-modal retrieval,\u201d Proceedings of the IEEE International Conference on Computer Vision, pp.4094-4102, 2015. 10.1109\/iccv.2015.466","DOI":"10.1109\/ICCV.2015.466"},{"key":"26","doi-asserted-by":"crossref","unstructured":"[26] Y. Cao, M. Long, J. Wang, Q. Yang, and P.S. Yu, \u201cDeep visual-semantic hashing for cross-modal retrieval,\u201d Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp.1445-1454, ACM, 2016. 10.1145\/2939672.2939812","DOI":"10.1145\/2939672.2939812"},{"key":"27","doi-asserted-by":"publisher","unstructured":"[27] P. Tzirakis, G. Trigeorgis, M.A. Nicolaou, B.W. Schuller, and S. Zafeiriou, \u201cEnd-to-end multimodal emotion recognition using deep neural networks,\u201d IEEE J. Sel. Topics Signal Process., vol.11, no.8, pp.1301-1309, 2017. 10.1109\/jstsp.2017.2764438","DOI":"10.1109\/JSTSP.2017.2764438"},{"key":"28","unstructured":"[28] R. Collobert, J. Weston, L. Bottou, M. Karlen, K. Kavukcuoglu, and P. Kuksa, \u201cNatural language processing (almost) from scratch,\u201d Journal of Machine Learning Research, vol.12, no.Aug, pp.2493-2537, 2011."},{"key":"29","unstructured":"[29] X. Zhang, J. Zhao, and Y. LeCun, \u201cCharacter-level convolutional networks for text classification,\u201d Advances in neural information processing systems, pp.649-657, 2015."},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] Y. Kim, \u201cConvolutional neural networks for sentence classification,\u201d arXiv preprint arXiv:1408.5882, 2014.","DOI":"10.3115\/v1\/D14-1181"},{"key":"31","unstructured":"[31] Q.V. Le, \u201cBuilding high-level features using large scale unsupervised learning,\u201d Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on, pp.8595-8598, IEEE, 2013. 10.1109\/icassp.2013.6639343"},{"key":"32","unstructured":"[32] Y. LeCun, B.E. Boser, J.S. Denker, D. Henderson, R.E. Howard, W.E. Hubbard, and L.D. Jackel, \u201cHandwritten digit recognition with a back-propagation network,\u201d Advances in neural information processing systems, pp.396-404, 1990."},{"key":"33","doi-asserted-by":"publisher","unstructured":"[33] Y. LeCun, B. Boser, J.S. Denker, D. Henderson, R.E. Howard, W. Hubbard, and L.D. Jackel, \u201cBackpropagation applied to handwritten zip code recognition,\u201d Neural computation, vol.1, no.4, pp.541-551, 1989. 10.1162\/neco.1989.1.4.541","DOI":"10.1162\/neco.1989.1.4.541"},{"key":"34","doi-asserted-by":"publisher","unstructured":"[34] G. Hinton, L. Deng, D. Yu, G. Dahl, A.-R. Mohamed, N. Jaitly, A. Senior, V. Vanhoucke, P. Nguyen, T. Sainath, and B. Kingsbury, \u201cDeep neural networks for acoustic modeling in speech recognition: The shared views of four research groups,\u201d IEEE Signal Process. Mag., vol.29, no.6, pp.82-97, 2012. 10.1109\/msp.2012.2205597","DOI":"10.1109\/MSP.2012.2205597"},{"key":"35","doi-asserted-by":"crossref","unstructured":"[35] Y. Petetin, C. Laroche, and A. Mayoue, \u201cDeep neural networks for audio scene recognition.,\u201d EUSIPCO, pp.125-129, 2015. 10.1109\/eusipco.2015.7362358","DOI":"10.1109\/EUSIPCO.2015.7362358"},{"key":"36","unstructured":"[36] A. Krizhevsky, I. Sutskever, and G.E. Hinton, \u201cImagenet classification with deep convolutional neural networks,\u201d Advances in neural information processing systems, pp.1097-1105, 2012."},{"key":"37","doi-asserted-by":"crossref","unstructured":"[37] C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, and A. Rabinovich, \u201cGoing deeper with convolutions,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.1-9, 2015. 10.1109\/cvpr.2015.7298594","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"38","doi-asserted-by":"crossref","unstructured":"[38] K. He, X. Zhang, S. Ren, and J. Sun, \u201cDeep residual learning for image recognition,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.770-778, 2016. 10.1109\/cvpr.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"39","doi-asserted-by":"crossref","unstructured":"[39] T.N. Sainath, A.-R. Mohamed, B. Kingsbury, and B. Ramabhadran, \u201cDeep convolutional neural networks for lvcsr,\u201d Acoustics, speech and signal processing (ICASSP), 2013 IEEE international conference on, pp.8614-8618, IEEE, 2013. 10.1109\/icassp.2013.6639347","DOI":"10.1109\/ICASSP.2013.6639347"},{"key":"40","doi-asserted-by":"publisher","unstructured":"[40] O. Abdel-Hamid, A.-R. Mohamed, H. Jiang, L. Deng, G. Penn, and D. Yu, \u201cConvolutional neural networks for speech recognition,\u201d IEEE\/ACM Trans. Audio, Speech, Language Process., vol.22, no.10, pp.1533-1545, 2014. 10.1109\/taslp.2014.2339736","DOI":"10.1109\/TASLP.2014.2339736"},{"key":"41","doi-asserted-by":"crossref","unstructured":"[41] M. Espi, M. Fujimoto, K. Kinoshita, and T. Nakatani, \u201cExploiting spectro-temporal locality in deep learning based acoustic event detection,\u201d EURASIP Journal on Audio, Speech, and Music Processing, vol.2015, no.1, p.26, 2015. 10.1186\/s13636-015-0069-2","DOI":"10.1186\/s13636-015-0069-2"},{"key":"42","doi-asserted-by":"crossref","unstructured":"[42] L. Ma, Z. Lu, L. Shang, and H. Li, \u201cMultimodal convolutional neural networks for matching image and sentence,\u201d Proceedings of the IEEE international conference on computer vision, pp.2623-2631, 2015. 10.1109\/iccv.2015.301","DOI":"10.1109\/ICCV.2015.301"},{"key":"43","doi-asserted-by":"crossref","unstructured":"[43] Y. Jia, E. Shelhamer, J. Donahue, S. Karayev, J. Long, R. Girshick, S. Guadarrama, and T. Darrell, \u201cCaffe: Convolutional architecture for fast feature embedding,\u201d Proceedings of the 22nd ACM international conference on Multimedia, pp.675-678, ACM, 2014. 10.1145\/2647868.2654889","DOI":"10.1145\/2647868.2654889"},{"key":"44","doi-asserted-by":"crossref","unstructured":"[44] H. Han, W.-Y. Wang, and B.H. Mao, \u201cBorderline-smote: a new over-sampling method in imbalanced data sets learning,\u201d International conference on intelligent computing, vol.3644, pp.878-887, Springer, 2005. 10.1007\/11538059_91","DOI":"10.1007\/11538059_91"},{"key":"45","unstructured":"[45] A. Chowdhury and J. Alspector, \u201cData duplication: an imbalance problem?,\u201d ICML&apos;2003 Workshop on Learning from Imbalanced Data Sets (II), Washington, DC, 2003."}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E102.D\/10\/E102.D_2018EDP7383\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,10,5]],"date-time":"2019-10-05T03:26:11Z","timestamp":1570245971000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E102.D\/10\/E102.D_2018EDP7383\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,1]]},"references-count":45,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2019]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2018edp7383","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,10,1]]}}}