{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T19:01:07Z","timestamp":1773514867220,"version":"3.50.1"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2020,6,15]],"date-time":"2020-06-15T00:00:00Z","timestamp":1592179200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,6,15]],"date-time":"2020-06-15T00:00:00Z","timestamp":1592179200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2021,1]]},"DOI":"10.1007\/s00034-020-01468-w","type":"journal-article","created":{"date-parts":[[2020,6,15]],"date-time":"2020-06-15T03:22:33Z","timestamp":1592191353000},"page":"262-275","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["$$hf_0$$: A Hybrid Pitch Extraction Method for Multimodal Voice"],"prefix":"10.1007","volume":"40","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0781-7542","authenticated-orcid":false,"given":"Pradeep","family":"Rengaswamy","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"M. Gurunath","family":"Reddy","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"K. Sreenivasa","family":"Rao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pallab","family":"Dasgupta","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,6,15]]},"reference":[{"key":"1468_CR1","doi-asserted-by":"crossref","unstructured":"H. Ba, N. Yang, I. Demirkol, W. Heinzelman, BaNa: a hybrid approach for noise resilient pitch detection. In 2012 IEEE Statistical Signal Processing Workshop (SSP) (IEEE, 2012), pp 369\u2013372","DOI":"10.1109\/SSP.2012.6319706"},{"issue":"3","key":"1468_CR2","doi-asserted-by":"publisher","first-page":"1638","DOI":"10.1121\/1.2951592","volume":"124","author":"A Camacho","year":"2008","unstructured":"A. Camacho, J.G. Harris, A sawtooth waveform inspired pitch estimator for speech and music. J. Acoust. Soc. Am. 124(3), 1638\u20131652 (2008)","journal-title":"J. Acoust. Soc. Am."},{"key":"1468_CR3","doi-asserted-by":"crossref","unstructured":"W. Chu, A. Alwan, Reducing f0 frame error of f0 tracking algorithms under noisy conditions with an unvoiced\/voiced classification frontend. In 2009 IEEE International Conference on Acoustics, Speech and Signal Processing (IEEE, 2009), pp. 3969\u20133972","DOI":"10.1109\/ICASSP.2009.4960497"},{"key":"1468_CR4","doi-asserted-by":"publisher","first-page":"1917","DOI":"10.1121\/1.1458024","volume":"111","author":"A De Cheveign\u00e9","year":"2002","unstructured":"A. De Cheveign\u00e9, H. Kawahara, YIN, a fundamental frequency estimator for speech and music. J. Acoust. Soc. Am. 111, 1917\u20131930 (2002)","journal-title":"J. Acoust. Soc. Am."},{"key":"1468_CR5","doi-asserted-by":"crossref","unstructured":"T. Drugman, A. Alwan, Joint robust voicing detection and pitch estimation based on residual harmonics. In Twelfth Annual Conference of the International Speech Communication Association (2011)","DOI":"10.21437\/Interspeech.2011-519"},{"key":"1468_CR6","doi-asserted-by":"crossref","unstructured":"T. Drugman, T. Dutoit, Glottal closure and opening instant detection from speech signals. In Tenth Annual Conference of the International Speech Communication Association (2009)","DOI":"10.21437\/Interspeech.2009-47"},{"issue":"11","key":"1468_CR7","doi-asserted-by":"publisher","first-page":"1745","DOI":"10.1109\/LSP.2018.2874155","volume":"25","author":"T Drugman","year":"2018","unstructured":"T. Drugman, G. Huybrechts, V. Klimkov, A. Moinet, Traditional machine learning for pitch detection. IEEE Signal Process. Lett. 25(11), 1745\u20131749 (2018)","journal-title":"IEEE Signal Process. Lett."},{"issue":"3","key":"1468_CR8","doi-asserted-by":"publisher","first-page":"994","DOI":"10.1109\/TASL.2011.2170835","volume":"20","author":"T Drugman","year":"2011","unstructured":"T. Drugman, M. Thomas, J. Gudnason, P. Naylor, T. Dutoit, Detection of glottal closure instants from speech signals: a quantitative review. IEEE Trans. Audio Speech Lang. Process. 20(3), 994\u20131006 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"6","key":"1468_CR9","doi-asserted-by":"publisher","first-page":"1568","DOI":"10.1121\/1.387811","volume":"71","author":"H Duifhuis","year":"1982","unstructured":"H. Duifhuis, L.F. Willems, R.J. Sluyter, Measurement of pitch in speech: an implementation of Goldstein\u2019s theory of pitch perception. J. Acoust. Soc. Am. 71(6), 1568\u20131580 (1982)","journal-title":"J. Acoust. Soc. Am."},{"issue":"1","key":"1468_CR10","doi-asserted-by":"publisher","first-page":"102","DOI":"10.1109\/LSP.2012.2231675","volume":"20","author":"PN Garner","year":"2012","unstructured":"P.N. Garner, M. Cernak, P. Motlicek, A simple continuous pitch estimation algorithm. IEEE Signal Process. Lett. 20(1), 102\u2013105 (2012)","journal-title":"IEEE Signal Process. Lett."},{"issue":"1\u20132","key":"1468_CR11","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1016\/0378-5955(90)90170-T","volume":"47","author":"BR Glasberg","year":"1990","unstructured":"B.R. Glasberg, B.C.J. Moore, Derivation of auditory filter shapes from notched-noise data. Hear. Res. 47(1\u20132), 103\u2013138 (1990)","journal-title":"Hear. Res."},{"key":"1468_CR12","unstructured":"S. Gonzalez, M. Brookes, A pitch estimation filter robust to high levels of noise (PEFAC). In 2011 19th European Signal Processing Conference (IEEE, 2011), pp 451\u2013455"},{"key":"1468_CR13","doi-asserted-by":"crossref","unstructured":"K. Han, D.L. Wang, Neural networks for supervised pitch tracking in noise. In 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2014), pp. 1488\u20131492","DOI":"10.1109\/ICASSP.2014.6853845"},{"key":"1468_CR14","unstructured":"N. Henrich, Study of the glottal source in speech and singing: modeling and estimation, acoustic and electroglottographic measurements, perception. Universit\u00e9 Pierre et Marie Curie-Paris VI, Theses (2001)"},{"issue":"3","key":"1468_CR15","doi-asserted-by":"publisher","first-page":"1417","DOI":"10.1121\/1.1850031","volume":"117","author":"N Henrich","year":"2005","unstructured":"N. Henrich, C. d\u2019Alessandro, B. Doval, M. Castellengo, Glottal open quotient in singing: measurements and correlation with laryngeal mechanisms, vocal intensity, and fundamental frequency. J. Acoust. Soc. Am. 117(3), 1417\u20131430 (2005)","journal-title":"J. Acoust. Soc. Am."},{"issue":"1","key":"1468_CR16","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1121\/1.396427","volume":"83","author":"DJ Hermes","year":"1988","unstructured":"D.J. Hermes, Measurement of pitch by subharmonic summation. J. Acoust. Soc. Am. 83(1), 257\u2013264 (1988)","journal-title":"J. Acoust. Soc. Am."},{"key":"1468_CR17","unstructured":"S. Ioffe, C. Szegedy, Batch normalization: accelerating deep network training by reducing internal covariate shift. In Proceedings of the 32nd International Conference on International Conference on Machine Learning\u2014volume 37, JMLR.org, ICML\u201915 (2015), pp 448\u2013456. http:\/\/dl.acm.org\/citation.cfm?id=3045118.3045167"},{"key":"1468_CR18","doi-asserted-by":"crossref","unstructured":"H. Kawahara, H. Katayose, A. De Cheveign\u00e9, R.D. Patterson, Fixed point analysis of frequency to instantaneous frequency mapping for accurate estimation of F0 and periodicity (1999)","DOI":"10.21437\/Eurospeech.1999-613"},{"issue":"3\u20134","key":"1468_CR19","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1016\/S0167-6393(98)00085-5","volume":"27","author":"H Kawahara","year":"1999","unstructured":"H. Kawahara, I. Masuda-Katsuse, A. De Cheveigne, Restructuring speech representations using a pitch-adaptive time-frequency smoothing and an instantaneous-frequency-based F0 extraction: Possible role of a repetitive structure in sounds. Speech Commun. 27(3\u20134), 187\u2013207 (1999)","journal-title":"Speech Commun."},{"key":"1468_CR20","doi-asserted-by":"crossref","unstructured":"J.W. Kim, J. Salamon, P. Li, J.P. Bello, CREPE: a convolutional representation for pitch estimation. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2018), pp. 161\u2013165","DOI":"10.1109\/ICASSP.2018.8461329"},{"key":"1468_CR21","unstructured":"J. Kominek, A.W.Black, The CMU Arctic speech databases. In: Fifth ISCA workshop on speech synthesis (2004)"},{"key":"1468_CR22","doi-asserted-by":"crossref","unstructured":"S.G. Koolagudi, R. Reddy, J. Yadav, K.S. Rao, IITKGP-SEHSC: Hindi speech corpus for emotion analysis. In 2011 International conference on devices and communications (ICDeCom) (IEEE, 2011), pp 1\u20135","DOI":"10.1109\/ICDECOM.2011.5738540"},{"key":"1468_CR23","doi-asserted-by":"crossref","unstructured":"B. Liu, J. Tao, D. Zhang, Y. Zheng, A novel pitch extraction based on jointly trained deep BLSTM recurrent neural networks with bottleneck features. In 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2017), pp. 336\u2013340","DOI":"10.1109\/ICASSP.2017.7952173"},{"issue":"2","key":"1468_CR24","doi-asserted-by":"publisher","first-page":"710","DOI":"10.1121\/1.4973687","volume":"141","author":"Y Liu","year":"2017","unstructured":"Y. Liu, D.L. Wang, Speaker-dependent multipitch tracking using deep neural networks. J. Acoust. Soc. Am. 141(2), 710\u2013721 (2017)","journal-title":"J. Acoust. Soc. Am."},{"issue":"5","key":"1468_CR25","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1109\/TAU.1972.1162410","volume":"20","author":"J Markel","year":"1972","unstructured":"J. Markel, The SIFT algorithm for fundamental frequency estimation. IEEE Trans. Audio Electroacoust. 20(5), 367\u2013377 (1972)","journal-title":"IEEE Trans. Audio Electroacoust."},{"key":"1468_CR26","doi-asserted-by":"crossref","unstructured":"M. Mauch, S. Dixon, pYIN: a fundamental frequency estimator using probabilistic threshold distributions, 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2014), pp. 659\u2013663","DOI":"10.1109\/ICASSP.2014.6853678"},{"issue":"3","key":"1468_CR27","doi-asserted-by":"publisher","first-page":"1811","DOI":"10.1121\/1.420088","volume":"102","author":"R Meddis","year":"1997","unstructured":"R. Meddis, L. O\u2019Mard, A unitary model of pitch perception. J. Acoust. Soc. Am. 102(3), 1811\u20131820 (1997)","journal-title":"J. Acoust. Soc. Am."},{"key":"1468_CR28","unstructured":"A.M. Noll, Pitch determination of human speech by the harmonic product spectrum, the harmonic surn spectrum, and a maximum likelihood estimate. In: Symposium on Computer Processing in Communication, vol\u00a019 (University of Broodlyn Press, New York), pp 779\u2013797 (1970)"},{"issue":"2","key":"1468_CR29","doi-asserted-by":"publisher","first-page":"519","DOI":"10.1109\/TASL.2006.876756","volume":"15","author":"TL Nwe","year":"2007","unstructured":"T.L. Nwe, H. Li, Exploring vibrato-motivated acoustic features for singer identification. IEEE Trans. Audio Speech Lang. Process. 15(2), 519\u2013530 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"1468_CR30","doi-asserted-by":"crossref","unstructured":"F. Plante, G.F. Meyer, W.A. Ainsworth, A pitch extraction reference database. In: Fourth European Conference on Speech Communication and Technology (1995)","DOI":"10.21437\/Eurospeech.1995-191"},{"issue":"7","key":"1468_CR31","doi-asserted-by":"publisher","first-page":"459","DOI":"10.1016\/j.nurpra.2016.04.025","volume":"12","author":"A Pylypowich","year":"2016","unstructured":"A. Pylypowich, E. Duff, Differentiating the symptom of dysphonia. J. Nurse Pract. 12(7), 459\u2013466 (2016)","journal-title":"J. Nurse Pract."},{"issue":"1","key":"1468_CR32","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1111\/j.1467-8624.2011.01700.x","volume":"83","author":"C Quam","year":"2012","unstructured":"C. Quam, D. Swingley, Development in children\u2019s interpretation of pitch cues to emotions. Child Dev. 83(1), 236\u2013250 (2012)","journal-title":"Child Dev."},{"issue":"1","key":"1468_CR33","doi-asserted-by":"publisher","first-page":"24","DOI":"10.1109\/TASSP.1977.1162905","volume":"25","author":"L Rabiner","year":"1977","unstructured":"L. Rabiner, On the use of autocorrelation analysis for pitch detection. IEEE Trans. Acoust. Speech Signal Process. 25(1), 24\u201333 (1977)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"1468_CR34","doi-asserted-by":"crossref","unstructured":"P. Rengaswamy, G. Reddy, K.S. Rao, P. Dasgupta, A robust non-parametric and filtering based approach for glottal closure instant detection. In: INTERSPEECH, pp 1795\u20131799 (2016)","DOI":"10.21437\/Interspeech.2016-369"},{"issue":"5","key":"1468_CR35","doi-asserted-by":"publisher","first-page":"353","DOI":"10.1109\/TASSP.1974.1162598","volume":"22","author":"M Ross","year":"1974","unstructured":"M. Ross, H. Shaffer, A. Cohen, R. Freudberg, H. Manley, Average magnitude difference function pitch extractor. IEEE Trans. Acoust. Speech Signal Process. 22(5), 353\u2013362 (1974)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"issue":"3","key":"1468_CR36","doi-asserted-by":"publisher","first-page":"191","DOI":"10.1016\/S0167-6393(97)00002-2","volume":"21","author":"J Rouat","year":"1997","unstructured":"J. Rouat, Y.C. Liu, D. Morissette, A pitch determination and voiced\/unvoiced decision algorithm for noisy speech. Speech Commun. 21(3), 191\u2013207 (1997)","journal-title":"Speech Commun."},{"key":"1468_CR37","doi-asserted-by":"crossref","unstructured":"K. Saino, H. Zen, Y. Nankaku, A. Lee, K. Tokuda, An HMM-based singing voice synthesis system. In: Ninth International Conference on Spoken Language Processing (2006)","DOI":"10.21437\/Interspeech.2006-584"},{"issue":"6","key":"1468_CR38","doi-asserted-by":"publisher","first-page":"1759","DOI":"10.1109\/TASL.2012.2188515","volume":"20","author":"J Salamon","year":"2012","unstructured":"J. Salamon, E. G\u00f3mez, Melody extraction from polyphonic music signals using pitch contour characteristics. IEEE Trans. Audio Speech Lang. Process. 20(6), 1759\u20131770 (2012)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"1","key":"1468_CR39","doi-asserted-by":"publisher","first-page":"588","DOI":"10.1121\/1.421129","volume":"103","author":"ED Scheirer","year":"1998","unstructured":"E.D. Scheirer, Tempo and beat analysis of acoustic musical signals. J. Acoust. Soc. Am. 103(1), 588\u2013601 (1998)","journal-title":"J. Acoust. Soc. Am."},{"key":"1468_CR40","doi-asserted-by":"crossref","unstructured":"M. Schr\u00f6der, Emotional speech synthesis: a review. In: Seventh European Conference on Speech Communication and Technology (2001)","DOI":"10.21437\/Eurospeech.2001-150"},{"issue":"4","key":"1468_CR41","doi-asserted-by":"publisher","first-page":"829","DOI":"10.1121\/1.1910902","volume":"43","author":"MR Schroeder","year":"1968","unstructured":"M.R. Schroeder, Period histogram and product spectrum: new methods for fundamental-frequency measurement. J. Acoust. Soc. Am. 43(4), 829\u2013834 (1968)","journal-title":"J. Acoust. Soc. Am."},{"issue":"6","key":"1468_CR42","doi-asserted-by":"publisher","first-page":"697","DOI":"10.1109\/89.799695","volume":"7","author":"JO Smith","year":"1999","unstructured":"J.O. Smith, J.S. Abel, Bark and ERB bilinear transforms. IEEE Trans. Speech Audio Process. 7(6), 697\u2013708 (1999)","journal-title":"IEEE Trans. Speech Audio Process."},{"issue":"1","key":"1468_CR43","doi-asserted-by":"publisher","first-page":"223","DOI":"10.1121\/1.382239","volume":"65","author":"TV Sreenivas","year":"1979","unstructured":"T.V. Sreenivas, P.V.S. Rao, Pitch extraction from corrupted harmonics of the power spectrum. J. Acoust. Soc. Am. 65(1), 223\u2013228 (1979)","journal-title":"J. Acoust. Soc. Am."},{"key":"1468_CR44","doi-asserted-by":"crossref","unstructured":"X. Sun, A pitch determination algorithm based on subharmonic-to-harmonic ratio. In: Sixth International Conference on Spoken Language Processing (2000)","DOI":"10.21437\/ICSLP.2000-902"},{"key":"1468_CR45","first-page":"518","volume":"495","author":"D Talkin","year":"1995","unstructured":"D. Talkin, A robust algorithm for pitch tracking (RAPT). Speech Coding Synth. 495, 518 (1995)","journal-title":"Speech Coding Synth."},{"issue":"7\u20138","key":"1468_CR46","doi-asserted-by":"publisher","first-page":"841","DOI":"10.1016\/j.specom.2013.03.001","volume":"55","author":"LN Tan","year":"2013","unstructured":"L.N. Tan, A. Alwan, Multi-band summary correlogram-based pitch detection for noisy speech. Speech Commun. 55(7\u20138), 841\u2013856 (2013)","journal-title":"Speech Commun."},{"key":"1468_CR47","doi-asserted-by":"crossref","unstructured":"P. Verma, R.W. Schafer, Frequency estimation from waveforms using multi-layered neural networks. In INTERSPEECH, pp 2165\u20132169 (2016)","DOI":"10.21437\/Interspeech.2016-679"},{"key":"1468_CR48","doi-asserted-by":"crossref","unstructured":"D. Wang, P.C. Loizou, J.H.L. Hansen, F0 estimation in noisy speech based on long-term harmonic feature analysis combined with neural network classification. In Fifteenth Annual Conference of the International Speech Communication Association (2014)","DOI":"10.21437\/Interspeech.2014-250"},{"key":"1468_CR49","unstructured":"A.C. Wilson, R. Roelofs, M. Stern, N. Srebro, B. Recht, The marginal value of adaptive gradient methods in machine learning. In Advances in Neural Information Processing Systems (2017), pp 4148\u20134158"},{"issue":"3","key":"1468_CR50","doi-asserted-by":"publisher","first-page":"229","DOI":"10.1109\/TSA.2003.811539","volume":"11","author":"M Wu","year":"2003","unstructured":"M. Wu, D.L. Wang, G.J. Brown, A multipitch tracking algorithm for noisy speech. IEEE Trans. Speech Audio Process. 11(3), 229\u2013241 (2003)","journal-title":"IEEE Trans. Speech Audio Process."},{"issue":"6","key":"1468_CR51","doi-asserted-by":"publisher","first-page":"4559","DOI":"10.1121\/1.2916590","volume":"123","author":"SA Zahorian","year":"2008","unstructured":"S.A. Zahorian, H. Hu, A spectral\/temporal method for robust fundamental frequency tracking. J. Acoust. Soc. Am. 123(6), 4559\u20134571 (2008)","journal-title":"J. Acoust. Soc. Am."},{"key":"1468_CR52","doi-asserted-by":"crossref","unstructured":"J. Zhang, J. Tang, L.-R. Dai, RNN-BLSTM based multi-pitch estimation. In INTERSPEECH (2016), pp. 1785\u20131789","DOI":"10.21437\/Interspeech.2016-117"}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-020-01468-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00034-020-01468-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-020-01468-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,28]],"date-time":"2022-10-28T13:16:12Z","timestamp":1666962972000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00034-020-01468-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,6,15]]},"references-count":52,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2021,1]]}},"alternative-id":["1468"],"URL":"https:\/\/doi.org\/10.1007\/s00034-020-01468-w","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"value":"0278-081X","type":"print"},{"value":"1531-5878","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,6,15]]},"assertion":[{"value":"1 July 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 May 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 May 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 June 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}