{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T06:28:19Z","timestamp":1772519299645,"version":"3.50.1"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"40","license":[{"start":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T00:00:00Z","timestamp":1700438400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T00:00:00Z","timestamp":1700438400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-023-17654-2","type":"journal-article","created":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T09:02:31Z","timestamp":1700470951000},"page":"88523-88541","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["AHMN: A multi-modal network for long MOOC videos chapter segmentation"],"prefix":"10.1007","volume":"83","author":[{"given":"Jiasong","family":"Wu","sequence":"first","affiliation":[]},{"given":"Yu","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Youyong","family":"Kong","sequence":"additional","affiliation":[]},{"given":"Huazhong","family":"Shu","sequence":"additional","affiliation":[]},{"given":"Lotfi","family":"Senhadji","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,20]]},"reference":[{"key":"17654_CR1","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"key":"17654_CR2","doi-asserted-by":"crossref","unstructured":"Wang L, Li W, Li W, Van\u00a0Gool L (2018) Appearance-and-relation networks for video classification. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1430\u20131439","DOI":"10.1109\/CVPR.2018.00155"},{"key":"17654_CR3","doi-asserted-by":"crossref","unstructured":"Si C, Chen W, Wang W, Wang L, Tan T (2019) An attention enhanced graph convolutional lstm network for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1227\u20131236","DOI":"10.1109\/CVPR.2019.00132"},{"key":"17654_CR4","doi-asserted-by":"publisher","unstructured":"Xiao Y, Yuan Q, Jiang K, Jin X, He J, Zhang L, Lin C-w (2023) Local-global temporal difference learning for satellite video super-resolution. IEEE Trans Circ Syst Vid Technol 1\u201316. https:\/\/doi.org\/10.1109\/TCSVT.2023.3312321","DOI":"10.1109\/TCSVT.2023.3312321"},{"key":"17654_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TGRS.2021.3107352","volume":"60","author":"Y Xiao","year":"2022","unstructured":"Xiao Y, Su X, Yuan Q, Liu D, Shen H, Zhang L (2022) Satellite video super-resolution via multiscale deformable convolution alignment and temporal grouping projection. IEEE Trans Geosci Remote Sens 60:1\u201319. https:\/\/doi.org\/10.1109\/TGRS.2021.3107352","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"17654_CR6","doi-asserted-by":"crossref","unstructured":"Mukherjee A, Tiwari S, Chowdhury T, Chakraborty T (2019) Automatic curation of content tables for educational videos. In: Proceedings of the 42nd international ACM SIGIR conference on research and development in information retrieval, pp 1329\u20131332","DOI":"10.1145\/3331184.3331400"},{"key":"17654_CR7","unstructured":"Bendraou Y (2017) Video shot boundary detection and key-frame extraction using mathematical models. Universit\u00e9 du littoral c\u00f4te d\u2019opale"},{"issue":"12","key":"17654_CR8","doi-asserted-by":"publisher","first-page":"5136","DOI":"10.1109\/TIP.2013.2282081","volume":"22","author":"Z-M Lu","year":"2013","unstructured":"Lu Z-M, Shi Y (2013) Fast video shot boundary detection based on svd and pattern matching. IEEE Trans Image Process 22(12):5136\u20135145","journal-title":"IEEE Trans Image Process"},{"key":"17654_CR9","doi-asserted-by":"crossref","unstructured":"Shao H, Qu Y, Cui W (2015) Shot boundary detection algorithm based on hsv histogram and hog feature. In: 5th International conference on advanced engineering materials and technology, pp 951\u2013957","DOI":"10.2991\/icaemt-15.2015.181"},{"key":"17654_CR10","doi-asserted-by":"crossref","unstructured":"Shou Z, Wang D, Chang S-F (2016) Temporal action localization in untrimmed videos via multi-stage cnns. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1049\u20131058","DOI":"10.1109\/CVPR.2016.119"},{"key":"17654_CR11","doi-asserted-by":"crossref","unstructured":"Lin T, Liu X, Li X, Ding E, Wen S (2019) Bmn: Boundary-matching network for temporal action proposal generation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 3889\u20133898","DOI":"10.1109\/ICCV.2019.00399"},{"key":"17654_CR12","doi-asserted-by":"crossref","unstructured":"Lin T, Zhao X, Shou Z (2017) Single shot temporal action detection. In: Proceedings of the 25th ACM international conference on multimedia, pp 988\u2013996","DOI":"10.1145\/3123266.3123343"},{"key":"17654_CR13","doi-asserted-by":"crossref","unstructured":"Tapaswi M, Bauml M, Stiefelhagen R (2014) Storygraphs: visualizing character interactions as a timeline. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 827\u2013834","DOI":"10.1109\/CVPR.2014.111"},{"key":"17654_CR14","doi-asserted-by":"crossref","unstructured":"Rao A, Xu L, Xiong Y, Xu G, Huang Q, Zhou B, Lin D (2020) A local-to-global approach to multi-modal movie scene segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10146\u201310155","DOI":"10.1109\/CVPR42600.2020.01016"},{"key":"17654_CR15","doi-asserted-by":"crossref","unstructured":"Ma D, Zhang X, Ouyang X, Agam G (2017) Lecture vdeo indexing using boosted margin maximizing neural networks. In: 2017 16th IEEE international conference on machine learning and applications (ICMLA), pp 221\u2013227. IEEE","DOI":"10.1109\/ICMLA.2017.0-155"},{"key":"17654_CR16","doi-asserted-by":"crossref","unstructured":"Basu S, Yu Y, Singh VK, Zimmermann R (2016) Videopedia: Lecture video recommendation for educational blogs using topic modeling. In: International conference on multimedia modeling, pp 238\u2013250. Springer","DOI":"10.1007\/978-3-319-27671-7_20"},{"issue":"2","key":"17654_CR17","doi-asserted-by":"publisher","first-page":"27","DOI":"10.4018\/jthi.2005040102","volume":"1","author":"M Lin","year":"2005","unstructured":"Lin M, Chau M, Cao J, Nunamaker JF Jr (2005) Automated video segmentation for lecture videos: A linguistics-based approach. Int. J. Technol. Hum. Interact (IJTHI) 1(2):27\u201345","journal-title":"Int. J. Technol. Hum. Interact (IJTHI)"},{"key":"17654_CR18","doi-asserted-by":"crossref","unstructured":"Shah RR, Yu Y, Shaikh AD, Tang S, Zimmermann R (2014) Atlas: automatic temporal segmentation and annotation of lecture videos based on modelling transition time. In: Proceedings of the 22nd ACM international conference on multimedia, pp 209\u2013212","DOI":"10.1145\/2647868.2656407"},{"key":"17654_CR19","doi-asserted-by":"crossref","unstructured":"Che X, Yang H, Meinel C (2013) Lecture video segmentation by automatically analyzing the synchronized slides. In: Proceedings of the 21st ACM international conference on multimedia, pp 345\u2013348","DOI":"10.1145\/2502081.2508115"},{"key":"17654_CR20","doi-asserted-by":"crossref","unstructured":"Soares ER, Barr\u00e9re E (2019) An optimization model for temporal video lecture segmentation using word2vec and acoustic features. In: Proceedings of the 25th brazillian symposium on multimedia and the web, pp 513\u2013520","DOI":"10.1145\/3323503.3349548"},{"key":"17654_CR21","doi-asserted-by":"crossref","unstructured":"Zhao B, Lin S, Luo X, Xu S, Wang R (2017) A novel system for visual navigation of educational videos using multimodal cues. In: Proceedings of the 25th ACM International Conference on Multimedia, pp 1680\u20131688","DOI":"10.1145\/3123266.3123406"},{"key":"17654_CR22","doi-asserted-by":"crossref","unstructured":"Gupta R, Roy A, Christensen C, Kim S, Gerard S, Cincebeaux M, Divakaran A, Grindal T, Shah M (2023) Class prototypes based contrastive learning for classifying multi-label and fine-grained educational videos. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 19923\u201319933","DOI":"10.1109\/CVPR52729.2023.01908"},{"key":"17654_CR23","doi-asserted-by":"crossref","unstructured":"Croitoru I, Bogolin S-V, Albanie S, Liu Y, Wang Z, Yoon S, Dernoncourt F, Jin H, Bui T (2023) Moment detection in long tutorial videos. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2594\u20132604","DOI":"10.1109\/ICCV51070.2023.00245"},{"key":"17654_CR24","doi-asserted-by":"crossref","unstructured":"Lee DW, Ahuja C, Liang PP, Natu S, Morency L-P (2022) Multimodal lecture presentations dataset: Understanding multimodality in educational slides. arXiv:2208.08080","DOI":"10.1109\/ICCV51070.2023.01838"},{"key":"17654_CR25","unstructured":"Ghauri JA, Hakimov S, Ewerth R (2020) Classification of important segments in educational videos using multimodal features. arXiv:2010.13626"},{"key":"17654_CR26","doi-asserted-by":"crossref","unstructured":"Zhong Y, Ji W, Xiao J, Li Y, Deng W, Chua T-S (2022) Video question answering: datasets, algorithms and challenges. arXiv:2203.01225","DOI":"10.18653\/v1\/2022.emnlp-main.432"},{"key":"17654_CR27","doi-asserted-by":"crossref","unstructured":"Ge Y, Ge Y, Liu X, Li D, Shan Y, Qie X, Luo P (2022) Bridging video-text retrieval with multiple choice questions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16167\u201316176","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"17654_CR28","doi-asserted-by":"crossref","unstructured":"Yang A, Miech A, Sivic J, Laptev I, Schmid C (2022) Tubedetr: Spatio-temporal video grounding with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16442\u201316453","DOI":"10.1109\/CVPR52688.2022.01595"},{"key":"17654_CR29","unstructured":"Awad G, Butt AA, Fiscus J, Joy D, Delgado A, Mcclinton W, Michel M, Smeaton AF, Graham Y, Kraaij W (2017) Trecvid 2017: evaluating ad-hoc and instance video search, events detection, video captioning, and hyperlinking. In: TREC video retrieval evaluation (TRECVID)"},{"key":"17654_CR30","unstructured":"Singer U, Polyak A, Hayes T, Yin X, An J, Zhang S, Hu Q, Yang H, Ashual O, Gafni O, et al (2022) Make-a-video: Text-to-video generation without text-video data. arXiv:2209.14792"},{"key":"17654_CR31","doi-asserted-by":"crossref","unstructured":"Liu Z, Zhao F, Zhang M (2022) Multi-modal transformer for video retrieval using improved sentence embeddings. In: Fourteenth international conference on digital image processing (ICDIP 2022), vol. 12342, pp 601\u2013607. SPIE","DOI":"10.1117\/12.2643741"},{"issue":"9","key":"17654_CR32","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao L, Guo Z, Zhang H, Xu X, Shen HT (2017) Video captioning with attention-based lstm and semantic consistency. IEEE Trans Multimed 19(9):2045\u20132055","journal-title":"IEEE Trans Multimed"},{"key":"17654_CR33","doi-asserted-by":"crossref","unstructured":"Le TM, Le V, Venkatesh S, Tran T (2020) Hierarchical conditional relation networks for video question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9972\u20139981","DOI":"10.1109\/CVPR42600.2020.00999"},{"issue":"12","key":"17654_CR34","doi-asserted-by":"publisher","first-page":"9073","DOI":"10.1109\/TPAMI.2021.3120745","volume":"44","author":"S Zhang","year":"2021","unstructured":"Zhang S, Peng H, Fu J, Lu Y, Luo J (2021) Multi-scale 2d temporal adjacency networks for moment localization with natural language. IEEE Trans Pattern Anal Mach Intell 44(12):9073\u20139087","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"17654_CR35","doi-asserted-by":"crossref","unstructured":"Miech A, Alayrac J-B, Smaira L, Laptev I, Sivic J, Zisserman A (2020) End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9879\u20139889","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"17654_CR36","doi-asserted-by":"crossref","unstructured":"Lei J, Li L, Zhou L, Gan Z, Berg TL, Bansal M, Liu J (2021) Less is more: Clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 7331\u20137341","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"17654_CR37","doi-asserted-by":"crossref","unstructured":"Mei X, Liu X, Sun J, Plumbley MD, Wang W (2022) On metric learning for audio-text cross-modal retrieval. arXiv:2203.15537","DOI":"10.21437\/Interspeech.2022-11115"},{"key":"17654_CR38","doi-asserted-by":"crossref","unstructured":"Liu X, Mei X, Huang Q, Sun J, Zhao J, Liu H, Plumbley MD, Kilic V, Wang W (2022) Leveraging pre-trained bert for audio captioning. In: 2022 30th European Signal Processing Conference (EUSIPCO), pp 1145\u20131149. IEEE","DOI":"10.23919\/EUSIPCO55093.2022.9909761"},{"key":"17654_CR39","doi-asserted-by":"crossref","unstructured":"Aldausari N, Sowmya A, Marcus N, Mohammadi G (2022) Cascaded siamese self-supervised audio to video gan. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4691\u20134700","DOI":"10.1109\/CVPRW56347.2022.00515"},{"key":"17654_CR40","unstructured":"Iashin V, Rahtu E (2021) Taming visually guided sound generation. arXiv:2110.08791"},{"key":"17654_CR41","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805"},{"key":"17654_CR42","unstructured":"Mikolov T, Chen K, Corrado G, Dean J (2013) Efficient estimation of word representations in vector space. arXiv:1301.3781"},{"key":"17654_CR43","doi-asserted-by":"crossref","unstructured":"Koshorek O, Cohen A, Mor N, Rotman M, Berant J (2018) Text segmentation as a supervised learning task. arXiv:1803.09337","DOI":"10.18653\/v1\/N18-2075"},{"key":"17654_CR44","doi-asserted-by":"crossref","unstructured":"Berlage O, Lux K-M, Graus D (2020) Improving automated segmentation of radio shows with audio embeddings. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 751\u2013755. IEEE","DOI":"10.1109\/ICASSP40776.2020.9054315"},{"key":"17654_CR45","doi-asserted-by":"crossref","unstructured":"Hershey S, Chaudhuri S, Ellis DP, Gemmeke JF, Jansen A, Moore RC, Plakal M, Platt D, Saurous RA, Seybold B (2017) Cnn architectures for large-scale audio classification. In: 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 131\u2013135. IEEE","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"17654_CR46","doi-asserted-by":"crossref","unstructured":"Gemmeke JF, Ellis DP, Freedman D, Jansen A, Lawrence W, Moore RC, Plakal M, Ritter M (2017) Audio set: An ontology and human-labeled dataset for audio events. In: 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 776\u2013780. IEEE","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"17654_CR47","unstructured":"Chen Y (2015) Convolutional neural network for sentence classification. Master\u2019s thesis, University of Waterloo"},{"key":"17654_CR48","doi-asserted-by":"crossref","unstructured":"Munro J, Damen D (2020) Multi-modal domain adaptation for fine-grained action recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 122\u2013132","DOI":"10.1109\/CVPR42600.2020.00020"},{"key":"17654_CR49","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"17654_CR50","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing systems, pp 5998\u20136008"},{"key":"17654_CR51","doi-asserted-by":"crossref","unstructured":"Cho K, Van\u00a0Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using rnn encoder-decoder for statistical machine translation. arXiv:1406.1078","DOI":"10.3115\/v1\/D14-1179"},{"key":"17654_CR52","unstructured":"Katharopoulos A, Vyas A, Pappas N, Fleuret F (2020) Transformers are rnns: Fast autoregressive transformers with linear attention. In: International conference on machine learning, pp 5156\u20135165. PMLR"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-17654-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-023-17654-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-17654-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,13]],"date-time":"2024-12-13T10:10:46Z","timestamp":1734084646000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-023-17654-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,20]]},"references-count":52,"journal-issue":{"issue":"40","published-online":{"date-parts":[[2024,12]]}},"alternative-id":["17654"],"URL":"https:\/\/doi.org\/10.1007\/s11042-023-17654-2","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,11,20]]},"assertion":[{"value":"10 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 October 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 November 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 November 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"On behalf of all authors, the corresponding author states that there is no conflict of interest","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}