{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T22:01:10Z","timestamp":1743112870407,"version":"3.40.3"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030602758"},{"type":"electronic","value":"9783030602765"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-60276-5_1","type":"book-chapter","created":{"date-parts":[[2020,10,4]],"date-time":"2020-10-04T07:02:44Z","timestamp":1601794964000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Lightweight CNN for Robust Voice Activity Detection"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4284-2743","authenticated-orcid":false,"given":"Tanvirul","family":"Alam","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1517-5319","authenticated-orcid":false,"given":"Akib","family":"Khan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,9,29]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Chang, S.Y., et al.: Temporal modeling using dilated convolution and gating for voice-activity-detection. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5549\u20135553. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461921"},{"key":"1_CR2","doi-asserted-by":"crossref","unstructured":"Chuangsuwanich, E., Glass, J.: Robust voice activity detector for real world applications using harmonicity and modulation frequency. In: Twelfth Annual Conference of the International Speech Communication Association (2011)","DOI":"10.21437\/Interspeech.2011-676"},{"key":"1_CR3","unstructured":"pytorch cifar (2017). https:\/\/github.com\/kuangliu\/pytorch-cifar"},{"key":"1_CR4","doi-asserted-by":"publisher","first-page":"28","DOI":"10.1016\/j.asoc.2016.12.024","volume":"52","author":"YM Costa","year":"2017","unstructured":"Costa, Y.M., Oliveira, L.S., Silla Jr., C.N.: An evaluation of convolutional neural networks for music classification using spectrograms. Appl. Soft Comput. 52, 28\u201338 (2017)","journal-title":"Appl. Soft Comput."},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"Eyben, F., Weninger, F., Squartini, S., Schuller, B.: Real-life voice activity detection with LSTM Recurrent Neural Networks and an application to Hollywood movies. In: 2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 483\u2013487. IEEE (2013)","DOI":"10.1109\/ICASSP.2013.6637694"},{"key":"1_CR6","unstructured":"Ghiasi, G., Lin, T.Y., Le, Q.V.: Dropblock: a regularization method for convolutional networks. In: Advances in Neural Information Processing Systems, pp. 10727\u201310737 (2018)"},{"issue":"3","key":"1_CR7","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TASL.2010.2052803","volume":"19","author":"PK Ghosh","year":"2011","unstructured":"Ghosh, P.K., Tsiartas, A., Narayanan, S.S.: Robust voice activity detection using long-term signal variability. IEEE Trans. Speech Audio Process. 19(3), 600\u2013613 (2011)","journal-title":"IEEE Trans. Speech Audio Process."},{"issue":"12","key":"1_CR8","doi-asserted-by":"publisher","first-page":"e0144610","DOI":"10.1371\/journal.pone.0144610","volume":"10","author":"T Giannakopoulos","year":"2015","unstructured":"Giannakopoulos, T.: Pyaudioanalysis: an open-source python library for audio signal analysis. PloS One 10(12), e0144610 (2015)","journal-title":"PloS One"},{"key":"1_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"630","DOI":"10.1007\/978-3-319-46493-0_38","volume-title":"Computer Vision \u2013 ECCV 2016","author":"K He","year":"2016","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Identity mappings in deep residual networks. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 630\u2013645. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_38"},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Hershey, S., et al.: CNN architectures for large-scale audio classification. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 131\u2013135. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"1_CR11","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. In: NIPS Deep Learning and Representation Learning Workshop (2015), arxiv:1503.02531"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Hughes, T., Mierle, K.: Recurrent neural networks for voice activity detection. In: 2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 7378\u20137382. IEEE (2013)","DOI":"10.1109\/ICASSP.2013.6639096"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Jung, Y., Kim, Y., Choi, Y., Kim, H.: Joint learning using denoising variational autoencoders for voice activity detection. In: INTERSPEECH, pp. 1210\u20131214 (2018)","DOI":"10.21437\/Interspeech.2018-1151"},{"key":"1_CR14","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. In: 3rd International Conference on Learning Representations (2015)"},{"issue":"4","key":"1_CR15","doi-asserted-by":"publisher","first-page":"777","DOI":"10.1109\/TASSP.1981.1163642","volume":"29","author":"L Lamel","year":"1981","unstructured":"Lamel, L., Rabiner, L., Rosenberg, A., Wilpon, J.: An improved endpoint detector for isolated word recognition. IEEE Trans. Acoust. Speech Signal Process. 29(4), 777\u2013785 (1981)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Lin, R., Costello, C., Jankowski, C., Mruthyunjaya, V.: Optimizing voice activity detection for noisy conditions. In: Interspeech, pp. 2030\u20132034. ISCA (2019)","DOI":"10.21437\/Interspeech.2019-1776"},{"key":"1_CR17","unstructured":"McCloy, D.R., Souza, P.E., Wright, R.A., Haywood, J., Gehani, N., Rudolph, S.: The UW\/NU corpus (2013). http:\/\/depts.washington.edu\/phonlab\/resources\/uwnu\/ , version 1.0"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"McFee, B., Raffel, C., Liang, D., Ellis, D.P., McVicar, M., Battenberg, E., Nieto, O.: librosa: audio and music signal analysis in python. In: Proceedings of the 14th Python in Science Conference, vol. 8 (2015)","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Ng, T., et al.: Developing a speech activity detection system for the DARPA RATS program. In: Thirteenth Annual Conference of the International Speech Communication Association (2012)","DOI":"10.21437\/Interspeech.2012-527"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: Specaugment: a simple data augmentation method for automatic speech recognition. In: Interspeech, pp. 2613\u20132617. ISCA (2019)","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"1_CR21","unstructured":"Paszke, A., et al.: Pytorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems 32, pp. 8024\u20138035. Curran Associates, Inc. (2019). http:\/\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Reddy, C.K.A., Beyrami, E., Pool, J., Cutler, R., Srinivasan, S., Gehrke, J.: A scalable noisy speech dataset and online subjective test framework. In: Interspeech, pp. 1816\u20131820. ISCA (2019)","DOI":"10.21437\/Interspeech.2019-3087"},{"issue":"3","key":"1_CR23","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1109\/LSP.2013.2237903","volume":"20","author":"SO Sadjadi","year":"2013","unstructured":"Sadjadi, S.O., Hansen, J.H.: Unsupervised speech activity detection using voicing measures and perceptual spectral flux. IEEE Signal Process. Lett. 20(3), 197\u2013200 (2013)","journal-title":"IEEE Signal Process. Lett."},{"issue":"3","key":"1_CR24","doi-asserted-by":"publisher","first-page":"279","DOI":"10.1109\/LSP.2017.2657381","volume":"24","author":"J Salamon","year":"2017","unstructured":"Salamon, J., Bello, J.P.: Deep convolutional neural networks and data augmentation for environmental sound classification. IEEE Signal Process. Lett. 24(3), 279\u2013283 (2017)","journal-title":"IEEE Signal Process. Lett."},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Saon, G., Thomas, S., Soltau, H., Ganapathy, S., Kingsbury, B.: The IBM speech activity detection system for the DARPA RATS program. In: Interspeech, pp. 3497\u20133501. ISCA (2013)","DOI":"10.21437\/Interspeech.2013-264"},{"key":"1_CR26","doi-asserted-by":"publisher","first-page":"9017","DOI":"10.1109\/ACCESS.2018.2800728","volume":"6","author":"A Sehgal","year":"2018","unstructured":"Sehgal, A., Kehtarnavaz, N.: A convolutional neural network smartphone app for real-time voice activity detection. IEEE Access 6, 9017\u20139026 (2018)","journal-title":"IEEE Access"},{"key":"1_CR27","doi-asserted-by":"crossref","unstructured":"Shannon, M., Simko, G., Chang, S.Y., Parada, C.: Improved end-of-query detection for streaming speech recognition. In: Interspeech, pp. 1909\u20131913 (2017)","DOI":"10.21437\/Interspeech.2017-496"},{"key":"1_CR28","unstructured":"Snyder, D., Chen, G., Povey, D.: MUSAN: a music, speech, and noise corpus. CoRR abs\/1510.08484 (2015), arxiv:1510.08484"},{"issue":"1","key":"1_CR29","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res 15(1), 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res"},{"key":"1_CR30","doi-asserted-by":"crossref","unstructured":"Thomas, S., Ganapathy, S., Saon, G., Soltau, H.: Analyzing convolutional neural networks for speech activity detection in mismatched acoustic conditions. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2519\u20132523. IEEE (2014)","DOI":"10.1109\/ICASSP.2014.6854054"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Tong, S., Gu, H., Yu, K.: A comparative study of robustness of deep learning approaches for VAD. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5695\u20135699. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472768"},{"issue":"4","key":"1_CR32","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1049\/ip-i-2.1992.0052","volume":"139","author":"R Tucker","year":"1992","unstructured":"Tucker, R.: Voice activity detection using a periodicity measure. IEEE Proc. I (Commun. Speech Vision) 139(4), 377\u2013380 (1992)","journal-title":"IEEE Proc. I (Commun. Speech Vision)"},{"issue":"2","key":"1_CR33","doi-asserted-by":"publisher","first-page":"180","DOI":"10.1049\/el:20000192","volume":"36","author":"KH Woo","year":"2000","unstructured":"Woo, K.H., Yang, T.Y., Park, K.J., Lee, C.: Robust voice activity detection algorithm for estimating noise spectrum. Electron. Lett. 36(2), 180\u2013181 (2000)","journal-title":"Electron. Lett."},{"issue":"8","key":"1_CR34","doi-asserted-by":"publisher","first-page":"2624","DOI":"10.1109\/TASL.2011.2125953","volume":"19","author":"D Ying","year":"2011","unstructured":"Ying, D., Yan, Y., Dang, J., Soong, F.K.: Voice activity detection based on an unsupervised learning framework. IEEE Trans. Audio Speech Lang. Process. 19(8), 2624\u20132633 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"1_CR35","doi-asserted-by":"crossref","unstructured":"Zazo, R., Sainath, T.N., Simko, G., Parada, C.: Feature learning with raw-waveform CLDNNs for voice activity detection. In: Interspeech, pp. 3668\u20133672 (2016)","DOI":"10.21437\/Interspeech.2016-268"},{"key":"1_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, X.L., Wang, D.: Boosted deep neural networks and multi-resolution cochleagram features for voice activity detection. In: Fifteenth Annual Conference of the International Speech Communication Association (2014)","DOI":"10.21437\/Interspeech.2014-367"},{"key":"1_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, X.L., Wu, J.: Denoising deep neural networks based voice activity detection. In: 2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 853\u2013857. IEEE (2013)","DOI":"10.1109\/ICASSP.2013.6637769"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-60276-5_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,22]],"date-time":"2022-11-22T02:41:10Z","timestamp":1669084870000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-60276-5_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030602758","9783030602765"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-60276-5_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"29 September 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"St. Petersburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Russia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 October 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 October 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/specom.nw.ru\/2020\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"160","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"65","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"41% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Due to the Corona pandemic SPECOM 2020 was held as a virtual event","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}