{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T17:51:38Z","timestamp":1740160298156,"version":"3.37.3"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2023,3,19]],"date-time":"2023-03-19T00:00:00Z","timestamp":1679184000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,3,19]],"date-time":"2023-03-19T00:00:00Z","timestamp":1679184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2021YFF0901005"],"award-info":[{"award-number":["2021YFF0901005"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61922073","62106244","U20A20229"],"award-info":[{"award-number":["61922073","62106244","U20A20229"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1007\/s13042-023-01809-6","type":"journal-article","created":{"date-parts":[[2023,3,27]],"date-time":"2023-03-27T01:54:23Z","timestamp":1679882063000},"page":"2913-2924","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["HMNet: a hierarchical multi-modal network for educational video concept prediction"],"prefix":"10.1007","volume":"14","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4817-8858","authenticated-orcid":false,"given":"Wei","family":"Huang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tong","family":"Xiao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qi","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenya","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianhui","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Enhong","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,3,19]]},"reference":[{"key":"1809_CR1","doi-asserted-by":"crossref","unstructured":"Gabeur V, Sun C, Alahari K, Schmid C (2020) Multi-modal transformer for video retrieval. In: European Conference on Computer Vision, pp 214\u2013229 (2020). Springer","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"1809_CR2","doi-asserted-by":"crossref","unstructured":"Wang X, Zhu L, Yang Y (2021) T2vlad: global-local sequence alignment for text-video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5079\u20135088","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"1809_CR3","doi-asserted-by":"crossref","unstructured":"Liu S, Fan H, Qian S Chen Y, Ding W, Wang Z (2021) Hit: Hierarchical transformer with momentum contrast for video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 11915\u201311925","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"1809_CR4","doi-asserted-by":"crossref","unstructured":"Shvetsova N, Chen B, Rouditchenko A, Thomas S, Kingsbury B, Feris RS, Harwath D, Glass J, Kuehne H (2022) Everything at once-multi-modal fusion transformer for video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 20020\u201320029","DOI":"10.1109\/CVPR52688.2022.01939"},{"issue":"2","key":"1809_CR5","doi-asserted-by":"publisher","first-page":"142","DOI":"10.1109\/TLT.2014.2307305","volume":"7","author":"H Yang","year":"2014","unstructured":"Yang H, Meinel C (2014) Content based lecture video retrieval using speech and video text information. IEEE Trans Learn Technol 7(2):142\u2013154","journal-title":"IEEE Trans Learn Technol"},{"key":"1809_CR6","doi-asserted-by":"crossref","unstructured":"Cooper M, Zhao J, Bhatt C, Shamma DA (2018) Moocex: exploring educational video via recommendation. In: Proceedings of the 2018 ACM on International Conference on Multimedia Retrieval, pp 521\u2013524","DOI":"10.1145\/3206025.3206087"},{"issue":"3","key":"1809_CR7","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1109\/TKDE.2018.2885520","volume":"32","author":"X Du","year":"2018","unstructured":"Du X, Yin H, Chen L, Wang Y, Yang Y, Zhou X (2018) Personalized video recommendation using rich contents from videos. IEEE Trans Knowl Data Eng 32(3):492\u2013505","journal-title":"IEEE Trans Knowl Data Eng"},{"issue":"1","key":"1809_CR8","doi-asserted-by":"publisher","first-page":"967","DOI":"10.1007\/s11042-016-4282-5","volume":"77","author":"M Furini","year":"2018","unstructured":"Furini M (2018) On introducing timed tag-clouds in video lectures indexing. Multimed Tools Appl 77(1):967\u2013984","journal-title":"Multimed Tools Appl"},{"key":"1809_CR9","doi-asserted-by":"crossref","unstructured":"Husain M, Meena S (2019) Multimodal fusion of speech and text using semi-supervised lda for indexing lecture videos. In: 2019 National Conference on Communications (NCC), pp 1\u20136. IEEE","DOI":"10.1109\/NCC.2019.8732253"},{"key":"1809_CR10","doi-asserted-by":"crossref","unstructured":"Cagliero L, Canale L, Farinetti L (2019) Visa: a supervised approach to indexing video lectures with semantic annotations. In: 2019 IEEE 43rd Annual Computer Software and Applications Conference (COMPSAC), vol. 1, pp 226\u2013235. IEEE","DOI":"10.1109\/COMPSAC.2019.00041"},{"key":"1809_CR11","unstructured":"Weston J, Bengio S, Usunier N (2011) Wsabie: scaling up to large vocabulary image annotation. In: Twenty-Second International Joint Conference on Artificial Intelligence"},{"key":"1809_CR12","unstructured":"Frome A, Corrado GS, Shlens J, Bengio S, Dean J, Ranzato M, Mikolov T (2013) Devise: a deep visual-semantic embedding model. Advances in neural information processing systems, 26"},{"key":"1809_CR13","doi-asserted-by":"crossref","unstructured":"Wu C-Y, Feichtenhofer C, Fan H, He K, Krahenbuhl P, Girshick R (2019) Long-term feature banks for detailed video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 284\u2013293","DOI":"10.1109\/CVPR.2019.00037"},{"key":"1809_CR14","doi-asserted-by":"crossref","unstructured":"Guo PJ, Kim J, Rubin R (2014) How video production affects student engagement: an empirical study of mooc videos. In: Proceedings of the First ACM Conference on Learning@ Scale Conference, pp 41\u201350","DOI":"10.1145\/2556325.2566239"},{"key":"1809_CR15","doi-asserted-by":"crossref","unstructured":"Wang X, Huang W, Liu Q, Yin Y, Huang Z, Wu L, Ma J, Wang X (2020) Fine-grained similarity measurement between educational videos and exercises. In: Proceedings of the 28th ACM International Conference on Multimedia, pp 331\u2013339","DOI":"10.1145\/3394171.3413783"},{"key":"1809_CR16","doi-asserted-by":"crossref","unstructured":"Papazoglou A, Ferrari V (2013) Fast object segmentation in unconstrained video. 2013 IEEE International Conference on Computer Vision, 1777\u20131784","DOI":"10.1109\/ICCV.2013.223"},{"key":"1809_CR17","doi-asserted-by":"crossref","unstructured":"Yu C-P, Le HM, Zelinsky GJ, Samaras D (2015) Efficient video segmentation using parametric graph partitioning. 2015 IEEE International Conference on Computer Vision (ICCV), 3155\u20133163","DOI":"10.1109\/ICCV.2015.361"},{"key":"1809_CR18","doi-asserted-by":"crossref","unstructured":"Wattanarachothai W, Patanukhom K (2015) Key frame extraction for text based video retrieval using maximally stable extremal regions. In: 2015 1st International Conference on Industrial Networks and Intelligent Systems (INISCom), pp 29\u201337. IEEE","DOI":"10.4108\/icst.iniscom.2015.258410"},{"key":"1809_CR19","doi-asserted-by":"crossref","unstructured":"Jain S, Wang X, Gonzalez JE (2019) Accel: a corrective fusion network for efficient semantic segmentation on video. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 8858\u20138867","DOI":"10.1109\/CVPR.2019.00907"},{"key":"1809_CR20","doi-asserted-by":"publisher","first-page":"66322","DOI":"10.1109\/ACCESS.2018.2878899","volume":"6","author":"X Bai","year":"2018","unstructured":"Bai X, Yang M, Lyu P, Xu Y, Luo J (2018) Integrating scene text and visual appearance for fine-grained image classification. IEEE Access 6:66322\u201366335","journal-title":"IEEE Access"},{"key":"1809_CR21","first-page":"8","volume":"3","author":"A Wu","year":"2018","unstructured":"Wu A, Han Y (2018) Multi-modal circulant fusion for video-to-language and backward. IJCAI 3:8","journal-title":"IJCAI"},{"key":"1809_CR22","doi-asserted-by":"crossref","unstructured":"Long X, Gan C, Melo G, Liu X, Li Y, Li F, Wen S (2018) Multimodal keyless attention fusion for video classification. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32","DOI":"10.1609\/aaai.v32i1.12319"},{"issue":"2","key":"1809_CR23","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1007\/s10994-008-5077-3","volume":"73","author":"C Vens","year":"2008","unstructured":"Vens C, Struyf J, Schietgat L, D\u017eeroski S, Blockeel H (2008) Decision trees for hierarchical multi-label classification. Mach Learn 73(2):185","journal-title":"Mach Learn"},{"issue":"1","key":"1809_CR24","doi-asserted-by":"publisher","first-page":"373","DOI":"10.1186\/s12859-016-1232-1","volume":"17","author":"R Cerri","year":"2016","unstructured":"Cerri R, Barros RC, de Carvalho AC, Jin Y (2016) Reduction strategies for hierarchical multi-label classification in protein function prediction. BMC Bioinform 17(1):373","journal-title":"BMC Bioinform"},{"key":"1809_CR25","doi-asserted-by":"crossref","unstructured":"Wehrmann J, Cerri R, Barros R (2018) Hierarchical multi-label classification networks. In: International Conference on Machine Learning, pp 5225\u20135234","DOI":"10.1145\/3019612.3019664"},{"key":"1809_CR26","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"key":"1809_CR27","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Advances in neural information processing systems, 30"},{"key":"1809_CR28","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"1809_CR29","unstructured":"Kingma DP, Ba J (2014) Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"issue":"8","key":"1809_CR30","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"1809_CR31","doi-asserted-by":"crossref","unstructured":"Wang X, Huang W, Liu Q, Yin Y, Huang Z, Wu L, Ma J, Wang X (2020) Fine-grained similarity measurement between educational videos and exercises. Proceedings of the 28th ACM International Conference on Multimedia","DOI":"10.1145\/3394171.3413783"},{"key":"1809_CR32","unstructured":"Glorot X, Bengio Y (2010) Understanding the difficulty of training deep feedforward neural networks. In: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics, pp 249\u2013256. JMLR Workshop and Conference Proceedings"},{"issue":"1","key":"1809_CR33","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton G, Krizhevsky A, Sutskever I, Salakhutdinov R (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15(1):1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"1809_CR34","unstructured":"Paszke A, Gross S, Massa F, Lerer A, Bradbury J, Chanan G, Killeen T, Lin Z, Gimelshein N, Antiga L, et al (2019) Pytorch: an imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703"},{"issue":"3","key":"1809_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.4018\/jdwm.2007070101","volume":"3","author":"G Tsoumakas","year":"2007","unstructured":"Tsoumakas G, Katakis I (2007) Multi-label classification: an overview. Int J Data Warehous Mining (IJDWM) 3(3):1\u201313","journal-title":"International Journal of Data Warehousing and Mining (IJDWM)"},{"key":"1809_CR36","unstructured":"Giunchiglia E, Lukasiewicz T (2020) Coherent hierarchical multi-label classification networks. In: 34th Conference on Neural Information Processing Systems (NeurIPS 2020), Vancouver, Canada"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-023-01809-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-023-01809-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-023-01809-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,21]],"date-time":"2023-07-21T04:28:19Z","timestamp":1689913699000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-023-01809-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3,19]]},"references-count":36,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2023,9]]}},"alternative-id":["1809"],"URL":"https:\/\/doi.org\/10.1007\/s13042-023-01809-6","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"type":"print","value":"1868-8071"},{"type":"electronic","value":"1868-808X"}],"subject":[],"published":{"date-parts":[[2023,3,19]]},"assertion":[{"value":"23 August 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 February 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 March 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}