{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T21:15:05Z","timestamp":1764018905647,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,8,5]],"date-time":"2020-08-05T00:00:00Z","timestamp":1596585600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,8,5]]},"DOI":"10.1145\/3421558.3421563","type":"proceedings-article","created":{"date-parts":[[2020,11,26]],"date-time":"2020-11-26T20:31:31Z","timestamp":1606422691000},"page":"31-39","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["A Survey of Lipreading Methods Based on Deep Learning"],"prefix":"10.1145","author":[{"given":"MINGFENG","family":"HAO","sequence":"first","affiliation":[{"name":"Xinjiang University, China"}]},{"given":"MUTELEP","family":"MAMUT","sequence":"additional","affiliation":[{"name":"Library of Xinjiang University, China"}]},{"given":"KURBAN","family":"UBUL","sequence":"additional","affiliation":[{"name":"Xinjiang University, China"}]}],"member":"320","published-online":{"date-parts":[[2020,11,25]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"11","article-title":"Design and Implementation of a Real-Time Lipreading System Using PCA and HMM","volume":"7","author":"Lee C.","year":"2004","unstructured":"Lee , C. , Lee , E. , Jung , S. and Lee , S . Design and Implementation of a Real-Time Lipreading System Using PCA and HMM . Journal of Korea Multimedia Society , 7 , 11 ( 2004 ), 1597-1609. Lee, C., Lee, E., Jung, S. and Lee, S. Design and Implementation of a Real-Time Lipreading System Using PCA and HMM. Journal of Korea Multimedia Society, 7, 11 (2004), 1597-1609.","journal-title":"Journal of Korea Multimedia Society"},{"key":"e_1_3_2_1_2_1","volume-title":"City","author":"Yao J.","year":"2016","unstructured":"Yao , J. and Kaifeng , Z . Evaluation Model of the Artist Based on Fuzzy Membership to Improve the Principal Component Analysis of Robust Kernel . City , 2016 . Yao, J. and Kaifeng, Z. Evaluation Model of the Artist Based on Fuzzy Membership to Improve the Principal Component Analysis of Robust Kernel. City, 2016."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2010.09.119"},{"key":"e_1_3_2_1_4_1","volume-title":"City","author":"Matthews I.","year":"2001","unstructured":"Matthews , I. , Potamianos , G. , Neti , C. and Luettin , J . A comparison of model and transform-based visual features for audio-visual LVCSR . City , 2001 . Matthews, I., Potamianos, G., Neti, C. and Luettin, J. A comparison of model and transform-based visual features for audio-visual LVCSR. City, 2001."},{"key":"e_1_3_2_1_5_1","volume-title":"City","author":"Morade S. S.","year":"2014","unstructured":"Morade , S. S. and Patnaik , S . Lip reading using DWT and LSDA . City , 2014 . Morade, S. S. and Patnaik, S. Lip reading using DWT and LSDA. City, 2014."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2017.34"},{"key":"e_1_3_2_1_7_1","volume-title":"City","author":"Shaikh A. A.","year":"2010","unstructured":"Shaikh , A. A. , Kumar , D. K. , Yau , W. C. , Azemin , M. Z. C. and Gubbi , J . Lip reading using optical flow and support vector machines . City , 2010 . Shaikh, A. A., Kumar, D. K., Yau, W. C., Azemin, M. Z. C. and Gubbi, J. Lip reading using optical flow and support vector machines. City, 2010."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1002\/scj.4690220607"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2008.2004924"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1006\/cviu.1996.0570"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2017.07.001"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2016.03.003"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_2_1_14_1","volume-title":"City","author":"Anina I.","year":"2015","unstructured":"Anina , I. , Zhou , Z. , Zhao , G. and Pietik\u00e4inen , M . OuluVS2: A multi-view audiovisual database for non-rigid mouth motion analysis . City , 2015 . Anina, I., Zhou, Z., Zhao, G. and Pietik\u00e4inen, M. OuluVS2: A multi-view audiovisual database for non-rigid mouth motion analysis. City, 2015."},{"key":"e_1_3_2_1_15_1","volume-title":"Lip reading in the wild","author":"Chung J. S.","year":"2016","unstructured":"Chung , J. S. and Zisserman , A . Lip reading in the wild . Springer , City , 2016 . Chung, J. S. and Zisserman, A. Lip reading in the wild. Springer, City, 2016."},{"key":"e_1_3_2_1_16_1","volume-title":"City","author":"Chung J. S.","year":"2017","unstructured":"Chung , J. S. , Senior , A. , Vinyals , O. and Zisserman , A . Lip reading sentences in the wild . City , 2017 . Chung, J. S., Senior, A., Vinyals, O. and Zisserman, A. Lip reading sentences in the wild. City, 2017."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756582"},{"key":"e_1_3_2_1_18_1","volume-title":"Large-scale visual speech recognition. arXiv preprint arXiv:1807.05162","author":"Shillingford B.","year":"2018","unstructured":"Shillingford , B. , Assael , Y. , Hoffman , M. W. , Paine , T. , Hughes , C. , Prabhu , U. , Liao , H. , Sak , H. , Rao , K. and Bennett , L . Large-scale visual speech recognition. arXiv preprint arXiv:1807.05162 ( 2018 ). Shillingford, B., Assael, Y., Hoffman, M. W., Paine, T., Hughes, C., Prabhu, U., Liao, H., Sak, H., Rao, K. and Bennett, L. Large-scale visual speech recognition. arXiv preprint arXiv:1807.05162 (2018)."},{"key":"e_1_3_2_1_19_1","volume-title":"Lip reading in profile","author":"Chung J.","year":"2017","unstructured":"Chung , J. and Zisserman , A . Lip reading in profile ( 2017 ). Chung, J. and Zisserman, A. Lip reading in profile (2017)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461596"},{"key":"e_1_3_2_1_21_1","volume-title":"City","author":"Afouras T.","year":"2018","unstructured":"Afouras , T. , Son Chung , J. and Zisserman , A . LRS3-TED: a large-scale dataset for visual speech recognition . City , 2018 . Afouras, T., Son Chung, J. and Zisserman, A. LRS3-TED: a large-scale dataset for visual speech recognition. City, 2018."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIS.2016.7550888"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952625"},{"key":"e_1_3_2_1_24_1","volume-title":"Learning Spatio-Temporal Features with Two-Stream Deep 3D CNNs for Lipreading. arXiv preprint arXiv:1905.02540","author":"Weng X.","year":"2019","unstructured":"Weng , X. and Kitani , K . Learning Spatio-Temporal Features with Two-Stream Deep 3D CNNs for Lipreading. arXiv preprint arXiv:1905.02540 ( 2019 ). Weng, X. and Kitani, K. Learning Spatio-Temporal Features with Two-Stream Deep 3D CNNs for Lipreading. arXiv preprint arXiv:1905.02540 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"End-to-end multi-view lipreading. arXiv preprint arXiv:1709.00443","author":"Petridis S.","year":"2017","unstructured":"Petridis , S. , Wang , Y. , Li , Z. and Pantic , M . End-to-end multi-view lipreading. arXiv preprint arXiv:1709.00443 ( 2017 ). Petridis, S., Wang, Y., Li, Z. and Pantic, M. End-to-end multi-view lipreading. arXiv preprint arXiv:1709.00443 (2017)."},{"key":"e_1_3_2_1_26_1","volume-title":"Concatenated frame image based cnn for visual speech recognition","author":"Saitoh T.","year":"2016","unstructured":"Saitoh , T. , Zhou , Z. , Zhao , G. and Pietik\u00e4inen , M . Concatenated frame image based cnn for visual speech recognition . Springer , City , 2016 . Saitoh, T., Zhou, Z., Zhao, G. and Pietik\u00e4inen, M. Concatenated frame image based cnn for visual speech recognition. Springer, City, 2016."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2927166"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/IC3.2018.8530509"},{"key":"e_1_3_2_1_29_1","first-page":"1149","volume-title":"Proc. Annu. Conf. Int. Speech. Commun. Assoc., INTERSPEECH Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH","author":"Noda K.","year":"2014","unstructured":"Noda , K. , Ogata , T. , Yamaguchi , Y. , Okuno , H. G. , Nakadai , K. and th Annual Conference of the International Speech Communication Association: Celebrating the Diversity of Spoken Languages, I. Lipreading using convolutional neural network . Proc. Annu. Conf. Int. Speech. Commun. Assoc., INTERSPEECH Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH ( 2014 ), 1149 - 1153 . Noda, K., Ogata, T., Yamaguchi, Y., Okuno, H. G., Nakadai, K. and th Annual Conference of the International Speech Communication Association: Celebrating the Diversity of Spoken Languages, I. Lipreading using convolutional neural network. Proc. Annu. Conf. Int. Speech. Commun. Assoc., INTERSPEECH Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH (2014), 1149-1153."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-014-0629-7"},{"key":"e_1_3_2_1_31_1","volume-title":"Lip reading using CNN and LSTM. Technical report","author":"Garg A.","year":"2016","unstructured":"Garg , A. , Noyola , J. and Bagadia , S . Lip reading using CNN and LSTM. Technical report , Stanford University , CS231n project report, 2016 . Garg, A., Noyola, J. and Bagadia, S. Lip reading using CNN and LSTM. Technical report, Stanford University, CS231n project report, 2016."},{"key":"e_1_3_2_1_32_1","volume-title":"Out of time: automated lip sync in the wild","author":"Chung J. S.","year":"2016","unstructured":"Chung , J. S. and Zisserman , A . Out of time: automated lip sync in the wild . Springer , City , 2016 . Chung, J. S. and Zisserman, A. Out of time: automated lip sync in the wild. Springer, City, 2016."},{"key":"e_1_3_2_1_33_1","volume-title":"Lip Reading with Hahn Convolutional Neural Networks. Image and Vision Computing","author":"Mesbah A.","year":"2019","unstructured":"Mesbah , A. , Berrahou , A. , Hammouchi , H. , Berbia , H. , Qjidaa , H. and Daoudi , M . Lip Reading with Hahn Convolutional Neural Networks. Image and Vision Computing ( 2019 ). Mesbah, A., Berrahou, A., Hammouchi, H., Berbia, H., Qjidaa, H. and Daoudi, M. Lip Reading with Hahn Convolutional Neural Networks. Image and Vision Computing (2019)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"e_1_3_2_1_35_1","volume-title":"City","author":"Tran D.","year":"2015","unstructured":"Tran , D. , Bourdev , L. , Fergus , R. , Torresani , L. and Paluri , M . Learning spatiotemporal features with 3d convolutional networks . City , 2015 . Tran, D., Bourdev, L., Fergus, R., Torresani, L. and Paluri, M. Learning spatiotemporal features with 3d convolutional networks. City, 2015."},{"key":"e_1_3_2_1_36_1","volume-title":"Convnet architecture search for spatiotemporal feature learning. arXiv preprint arXiv:1708.05038","author":"Tran D.","year":"2017","unstructured":"Tran , D. , Ray , J. , Shou , Z. , Chang , S.-F. and Paluri , M . Convnet architecture search for spatiotemporal feature learning. arXiv preprint arXiv:1708.05038 ( 2017 ). Tran, D., Ray, J., Shou, Z., Chang, S.-F. and Paluri, M. Convnet architecture search for spatiotemporal feature learning. arXiv preprint arXiv:1708.05038 (2017)."},{"key":"e_1_3_2_1_37_1","volume-title":"City","author":"Qiu Z.","year":"2017","unstructured":"Qiu , Z. , Yao , T. and Mei , T . Learning spatio-temporal representation with pseudo-3d residual networks . City , 2017 . Qiu, Z., Yao, T. and Mei, T. Learning spatio-temporal representation with pseudo-3d residual networks. City, 2017."},{"key":"e_1_3_2_1_38_1","volume-title":"Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599","author":"Assael Y. M.","year":"2016","unstructured":"Assael , Y. M. , Shillingford , B. , Whiteson , S. and De Freitas , N. Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599 ( 2016 ). Assael, Y. M., Shillingford, B., Whiteson, S. and De Freitas, N. Lipnet: End-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599 (2016)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2017.2761539"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00088"},{"key":"e_1_3_2_1_41_1","volume-title":"City","author":"Huang G.","year":"2017","unstructured":"Huang , G. , Liu , Z. , Van Der Maaten , L. and Weinberger , K. Q . Densely connected convolutional networks . City , 2017 . Huang, G., Liu, Z., Van Der Maaten, L. and Weinberger, K. Q. Densely connected convolutional networks. City, 2017."},{"key":"e_1_3_2_1_42_1","volume-title":"City","author":"He K.","year":"2016","unstructured":"He , K. , Zhang , X. , Ren , S. and Sun , J . Deep residual learning for image recognition . City , 2016 . He, K., Zhang, X., Ren, S. and Sun, J. Deep residual learning for image recognition. City, 2016."},{"key":"e_1_3_2_1_43_1","volume-title":"Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105","author":"Stafylakis T.","year":"2017","unstructured":"Stafylakis , T. and Tzimiropoulos , G . Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105 ( 2017 ). Stafylakis, T. and Tzimiropoulos, G. Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105 (2017)."},{"key":"e_1_3_2_1_44_1","volume-title":"LipReading with 3D-2D-CNN BLSTM-HMM and word-CTC models. arXiv preprint arXiv:1906.12170","author":"Margam D. K.","year":"2019","unstructured":"Margam , D. K. , Aralikatti , R. , Sharma , T. , Thanda , A. , Roy , S. and Venkatesan , S. M . LipReading with 3D-2D-CNN BLSTM-HMM and word-CTC models. arXiv preprint arXiv:1906.12170 ( 2019 ). Margam, D. K., Aralikatti, R., Sharma, T., Thanda, A., Roy, S. and Venkatesan, S. M. LipReading with 3D-2D-CNN BLSTM-HMM and word-CTC models. arXiv preprint arXiv:1906.12170 (2019)."},{"key":"e_1_3_2_1_45_1","volume-title":"City","author":"Ngiam J.","year":"2011","unstructured":"Ngiam , J. , Khosla , A. , Kim , M. , Nam , J. , Lee , H. and Ng , A. Y . Multimodal deep learning . City , 2011 . Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H. and Ng, A. Y. Multimodal deep learning. City, 2011."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472852"},{"key":"e_1_3_2_1_47_1","volume-title":"Improving speaker-independent lipreading with domain-adversarial training. arXiv preprint arXiv:1708.01565","author":"Wand M.","year":"2017","unstructured":"Wand , M. and Schmidhuber , J . Improving speaker-independent lipreading with domain-adversarial training. arXiv preprint arXiv:1708.01565 ( 2017 ). Wand, M. and Schmidhuber, J. Improving speaker-independent lipreading with domain-adversarial training. arXiv preprint arXiv:1708.01565 (2017)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461900"},{"key":"e_1_3_2_1_49_1","volume-title":"End-to-end audiovisual fusion with LSTMs. arXiv preprint arXiv:1709.04343","author":"Petridis S.","year":"2017","unstructured":"Petridis , S. , Wang , Y. , Li , Z. and Pantic , M . End-to-end audiovisual fusion with LSTMs. arXiv preprint arXiv:1709.04343 ( 2017 ). Petridis, S., Wang, Y., Li, Z. and Pantic, M. End-to-end audiovisual fusion with LSTMs. arXiv preprint arXiv:1709.04343 (2017)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"e_1_3_2_1_51_1","volume-title":"Learning from Videos with Deep Convolutional LSTM Networks. arXiv preprint arXiv:1904.04817","author":"Courtney L.","year":"2019","unstructured":"Courtney , L. and Sreenivas , R . Learning from Videos with Deep Convolutional LSTM Networks. arXiv preprint arXiv:1904.04817 ( 2019 ). Courtney, L. and Sreenivas, R. Learning from Videos with Deep Convolutional LSTM Networks. arXiv preprint arXiv:1904.04817 (2019)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/PRIA.2017.7983045"},{"key":"e_1_3_2_1_53_1","volume-title":"City","author":"Ninomiya H.","year":"2015","unstructured":"Ninomiya , H. , Kitaoka , N. , Tamura , S. , Iribe , Y. and Takeda , K . Integration of deep bottleneck features for audio-visual speech recognition . City , 2015 . Ninomiya, H., Kitaoka, N., Tamura, S., Iribe, Y. and Takeda, K. Integration of deep bottleneck features for audio-visual speech recognition. City, 2015."},{"key":"e_1_3_2_1_54_1","volume-title":"Long short-term memory. Neural computation, 9, 8","author":"Hochreiter S.","year":"1997","unstructured":"Hochreiter , S. and Schmidhuber , J . Long short-term memory. Neural computation, 9, 8 ( 1997 ), 1735-1780. Hochreiter, S. and Schmidhuber, J. Long short-term memory. Neural computation, 9, 8 (1997), 1735-1780."},{"key":"e_1_3_2_1_55_1","volume-title":"Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078","author":"Cho K.","year":"2014","unstructured":"Cho , K. , Van Merri\u00ebnboer , B. , Gulcehre , C. , Bahdanau , D. , Bougares , F. , Schwenk , H. and Bengio , Y . Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 ( 2014 ). Cho, K., Van Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H. and Bengio, Y. Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)."},{"key":"e_1_3_2_1_56_1","volume-title":"City","author":"Vaswani A.","year":"2017","unstructured":"Vaswani , A. , Shazeer , N. , Parmar , N. , Uszkoreit , J. , Jones , L. , Gomez , A. N. , Kaiser , \u0141. and Polosukhin, I . Attention is all you need . City , 2017 . Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141. and Polosukhin, I. Attention is all you need. City, 2017."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_58_1","volume-title":"Deep lip reading: a comparison of models and an online application. arXiv preprint arXiv:1806.06053","author":"Afouras T.","year":"2018","unstructured":"Afouras , T. , Chung , J. S. and Zisserman , A . Deep lip reading: a comparison of models and an online application. arXiv preprint arXiv:1806.06053 ( 2018 ). Afouras, T., Chung, J. S. and Zisserman, A. Deep lip reading: a comparison of models and an online application. arXiv preprint arXiv:1806.06053 (2018)."},{"key":"e_1_3_2_1_59_1","volume-title":"City","author":"Chollet F.","year":"2017","unstructured":"Chollet , F. Xception : Deep learning with depthwise separable convolutions . City , 2017 . Chollet, F. Xception: Deep learning with depthwise separable convolutions. City, 2017."},{"key":"e_1_3_2_1_60_1","volume-title":"Deep audio-visual speech recognition","author":"Afouras T.","year":"2018","unstructured":"Afouras , T. , Chung , J. S. , Senior , A. , Vinyals , O. and Zisserman , A . Deep audio-visual speech recognition . IEEE transactions on pattern analysis and machine intelligence ( 2018 ). Afouras, T., Chung, J. S., Senior, A., Vinyals, O. and Zisserman, A. Deep audio-visual speech recognition. IEEE transactions on pattern analysis and machine intelligence (2018)."},{"key":"e_1_3_2_1_61_1","volume-title":"-P. Visual speech recognition using PCA networks and LSTMs in a tandem GMM-HMM system","author":"Zimmermann M.","year":"2016","unstructured":"Zimmermann , M. , Ghazi , M. M. , Ekenel , H. K. and Thiran , J . -P. Visual speech recognition using PCA networks and LSTMs in a tandem GMM-HMM system . Springer , City , 2016 . Zimmermann, M., Ghazi, M. M., Ekenel, H. K. and Thiran, J.-P. Visual speech recognition using PCA networks and LSTMs in a tandem GMM-HMM system. Springer, City, 2016."},{"key":"e_1_3_2_1_62_1","volume-title":"-E. Multi-view automatic lip-reading using neural network","author":"Lee D.","year":"2016","unstructured":"Lee , D. , Lee , J. and Kim , K . -E. Multi-view automatic lip-reading using neural network . Springer , City , 2016 . Lee, D., Lee, J. and Kim, K.-E. Multi-view automatic lip-reading using neural network. Springer, City, 2016."},{"key":"e_1_3_2_1_63_1","volume-title":"Network in network. arXiv preprint arXiv:1312.4400","author":"Lin M.","year":"2013","unstructured":"Lin , M. , Chen , Q. and Yan , S . Network in network. arXiv preprint arXiv:1312.4400 ( 2013 ). Lin, M., Chen, Q. and Yan, S. Network in network. arXiv preprint arXiv:1312.4400 (2013)."},{"key":"e_1_3_2_1_64_1","volume-title":"City","author":"Krizhevsky A.","year":"2012","unstructured":"Krizhevsky , A. , Sutskever , I. and Hinton , G. E . Imagenet classification with deep convolutional neural networks . City , 2012 . Krizhevsky, A., Sutskever, I. and Hinton, G. E. Imagenet classification with deep convolutional neural networks. City, 2012."},{"key":"e_1_3_2_1_65_1","volume-title":"City","author":"Szegedy C.","year":"2015","unstructured":"Szegedy , C. , Liu , W. , Jia , Y. , Sermanet , P. , Reed , S. , Anguelov , D. , Erhan , D. , Vanhoucke , V. and Rabinovich , A . Going deeper with convolutions . City , 2015 . Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V. and Rabinovich, A. Going deeper with convolutions. City, 2015."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462280"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2018.02.001"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639643"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461347"}],"event":{"name":"IPMV 2020: 2020 2nd International Conference on Image Processing and Machine Vision","acronym":"IPMV 2020","location":"Bangkok Thailand"},"container-title":["2020 2nd International Conference on Image Processing and Machine Vision"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3421558.3421563","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3421558.3421563","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T17:49:22Z","timestamp":1750268962000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3421558.3421563"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,8,5]]},"references-count":69,"alternative-id":["10.1145\/3421558.3421563","10.1145\/3421558"],"URL":"https:\/\/doi.org\/10.1145\/3421558.3421563","relation":{},"subject":[],"published":{"date-parts":[[2020,8,5]]},"assertion":[{"value":"2020-11-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}