{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,11,11]],"date-time":"2023-11-11T00:17:28Z","timestamp":1699661848791},"reference-count":31,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,6,8]],"date-time":"2022-06-08T00:00:00Z","timestamp":1654646400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,6,8]],"date-time":"2022-06-08T00:00:00Z","timestamp":1654646400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1007\/s10772-021-09956-3","type":"journal-article","created":{"date-parts":[[2022,6,8]],"date-time":"2022-06-08T18:30:57Z","timestamp":1654713057000},"page":"599-608","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A comparison of neural-based visual recognisers for speech activity detection"],"prefix":"10.1007","volume":"26","author":[{"given":"Sajjadali","family":"Raza","sequence":"first","affiliation":[]},{"given":"Heriberto","family":"Cuay\u00e1huitl","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,6,8]]},"reference":[{"issue":"2","key":"9956_CR1","doi-asserted-by":"publisher","first-page":"265","DOI":"10.1109\/JSTSP.2019.2901195","volume":"13","author":"I Ariav","year":"2019","unstructured":"Ariav, I., & Cohen, I. (2019). An end-to-end multimodal voice activity detection using wavenet encoder and residual networks. IEEE Journal of Selected Topics in Signal Processing, 13(2), 265\u2013274.","journal-title":"IEEE Journal of Selected Topics in Signal Processing"},{"key":"9956_CR2","unstructured":"Braun, S. (2018). LSTM benchmarks for deep learning frameworks. arXiv preprint arXiv:180601818"},{"key":"9956_CR3","doi-asserted-by":"crossref","unstructured":"Cho, K., van Merrienboer, B., G\u00fcl\u00e7ehre, \u00c7., Bahdanau, D., Bougares, F., Schwenk, H., Bengio, Y. (2014). Learning phrase representations using RNN encoder-decoder for statistical machine translation. In Moschitti, A., Pang, B., Daelemans, W. (Eds.) Conference on empirical methods in natural language processing (EMNLP), ACL","DOI":"10.3115\/v1\/D14-1179"},{"key":"9956_CR4","doi-asserted-by":"crossref","unstructured":"Chollet, F. (2017). Xception: Deep learning with depthwise separable convolutions. IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2017.195"},{"key":"9956_CR5","unstructured":"Chollet. F., et\u00a0al. (2015) Recurrent neural networks (rnn) with keras. https:\/\/www.tensorflow.org\/guide\/keras\/rnn"},{"issue":"5","key":"9956_CR6","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S., & Shao, X. (2006). An audio-visual corpus for speech perception and automatic speech recognition. The Journal of the Acoustical Society of America, 120(5), 2421\u20132424.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"9956_CR7","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne\u00a0Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T. (2015). Long-term recurrent convolutional networks for visual recognition and description. In: IEEE conference on computer vision and pattern recognition","DOI":"10.21236\/ADA623249"},{"key":"9956_CR8","doi-asserted-by":"crossref","unstructured":"Filonenko, A., Kurnianggoro, L., Jo, K.H. (2017). Comparative study of modern convolutional neural networks for smoke detection on image data. In: International Conference on Human System Interactions (HSI), IEEE","DOI":"10.1109\/HSI.2017.8004998"},{"key":"9956_CR9","unstructured":"Goodfellow, I., Bengio, Y., Courville, A. (2016). Deep Learning. MIT Press, http:\/\/www.deeplearningbook.org"},{"issue":"8","key":"9956_CR10","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"9956_CR11","unstructured":"Ioffe, S., Szegedy, C. (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:150203167"},{"issue":"3","key":"9956_CR12","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1007\/s12193-015-0187-2","volume":"9","author":"B Joosten","year":"2015","unstructured":"Joosten, B., Postma, E. O., & Krahmer, E. (2015). Voice activity detection based on facial movement. Journal of Multimodal User Interfaces, 9(3), 183\u2013193.","journal-title":"Journal of Multimodal User Interfaces"},{"key":"9956_CR13","unstructured":"Jozefowicz, R., Zaremba, W., Sutskever, I. (2015). An empirical exploration of recurrent network architectures. In: International conference on machine learning"},{"key":"9956_CR14","unstructured":"Kingma, D. P., Ba, J. (2014). Adam: A method for stochastic optimization. arXiv:14126980"},{"key":"9956_CR15","unstructured":"Le Cornu, T., Milner, B. (2015). Voicing classification of visual speech using convolutional neural networks. In: FAAVSP-the 1st joint conference on facial analysis, animation and auditory-visual speech processing"},{"key":"9956_CR16","unstructured":"Li, A., Zheng, C., Li, X. (2019). Convolutional recurrent neural network based progressive learning for monaural speech enhancement. arXiv:190810768"},{"key":"9956_CR17","doi-asserted-by":"crossref","unstructured":"Lim, W., Jang, D., Lee, T. (2016). Speech emotion recognition using convolutional and recurrent neural networks. In: Asia-Pacific Signal and Inf. Proceedings of the association annual summit and conference (APSIPA).","DOI":"10.1109\/APSIPA.2016.7820699"},{"key":"9956_CR18","doi-asserted-by":"crossref","unstructured":"Sanderson, C., Lovell, B. C. (2009). Multi-region probabilistic histograms for robust and scalable identity inference. In: International conference on biometrics, Springer.","DOI":"10.1007\/978-3-642-01793-3_21"},{"key":"9956_CR19","doi-asserted-by":"publisher","first-page":"9017","DOI":"10.1109\/ACCESS.2018.2800728","volume":"6","author":"A Sehgal","year":"2018","unstructured":"Sehgal, A., & Kehtarnavaz, N. (2018). A convolutional neural network smartphone app for real-time voice activity detection. IEEE Access, 6, 9017\u20139026.","journal-title":"IEEE Access"},{"key":"9956_CR20","doi-asserted-by":"crossref","unstructured":"Shahid, M., Beyan, C., Murino, V. (2019). Voice activity detection by upper body motion analysis and unsupervised domain adaptation. In: IEEE\/CVF international conference on computer vision workshops.","DOI":"10.1109\/ICCVW.2019.00159"},{"key":"9956_CR21","doi-asserted-by":"crossref","unstructured":"Sharma, T., Aralikatti, R., Margam, D. K., Thanda, A., Roy, S., Kandala, P.A., Venkatesan, S. M. (2019). Real time online visual end point detection using unidirectional LSTM. Interspeech.","DOI":"10.21437\/Interspeech.2019-3253"},{"key":"9956_CR22","unstructured":"Simonyan, K., Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:14091556"},{"key":"9956_CR23","unstructured":"Soh, M. (2016). Learning CNN-LSTM architectures for image caption generation. Technical Report: Stanford University."},{"issue":"1","key":"9956_CR24","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R. (2014). Dropout: a simple way to prevent neural networks from overfitting. The Journal of Machine Learning Research, 15(1), 1929\u20131958.","journal-title":"The Journal of Machine Learning Research"},{"key":"9956_CR25","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. (2016). Rethinking the inception architecture for computer vision. In: IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2016.308"},{"key":"9956_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2019.07.003","author":"F Tao","year":"2019","unstructured":"Tao, F., & Busso, C. (2019). End-to-end audiovisual speech activity detection with bimodal recurrent neural models. Speech Communication. https:\/\/doi.org\/10.1016\/j.specom.2019.07.003.","journal-title":"Speech Communication"},{"key":"9956_CR27","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K. (2015). Sequence to sequence-video to text. In: IEEE international conference on computer vision.","DOI":"10.1109\/ICCV.2015.515"},{"key":"9956_CR28","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D. (2015). Show and tell: A neural image caption generator. In: IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"9956_CR29","doi-asserted-by":"crossref","unstructured":"Wang, B., Wang, X. (2019). Are you speaking: Real-time speech activity detection via landmark pooling network. In: 2019 14th IEEE international conference on automatic face & gesture recognition (FG 2019).","DOI":"10.1109\/FG.2019.8756549"},{"key":"9956_CR30","doi-asserted-by":"crossref","unstructured":"Wang, J., Yang, Y., Mao, J., Huang, Z., Huang, C., Xu, W. (2016). CNN-RNN: A unified framework for multi-label image classification. In IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2016.251"},{"key":"9956_CR31","unstructured":"Zhang, Y., Sun, X., Ma, S., Yang, Y., Ren, X. (2017). Does higher order LSTM have better accuracy for segmenting and labeling sequence data? arXiv:171108231"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-021-09956-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-021-09956-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-021-09956-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,10]],"date-time":"2023-11-10T14:09:53Z","timestamp":1699625393000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-021-09956-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,8]]},"references-count":31,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,9]]}},"alternative-id":["9956"],"URL":"https:\/\/doi.org\/10.1007\/s10772-021-09956-3","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,6,8]]},"assertion":[{"value":"30 July 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 December 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 June 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}