{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T05:31:01Z","timestamp":1732771861306,"version":"3.29.0"},"reference-count":48,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"9","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2024,9,1]]},"DOI":"10.1587\/transinf.2024edp7031","type":"journal-article","created":{"date-parts":[[2024,8,31]],"date-time":"2024-08-31T22:19:10Z","timestamp":1725142750000},"page":"1253-1263","source":"Crossref","is-referenced-by-count":0,"title":["TIG: A Multitask Temporal Interval Guided Framework for Key Frame Detection"],"prefix":"10.1587","volume":"E107.D","author":[{"given":"Shijie","family":"WANG","sequence":"first","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuejiao","family":"HU","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sheng","family":"LIU","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ming","family":"LI","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yang","family":"LI","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sidan","family":"DU","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"doi-asserted-by":"crossref","unstructured":"[1] I.S. Kwak, J.-Z. Guo, A. Hantman, K. Branson, and D. Kriegman, \u201cDetecting the starting frame of actions in video,\u201d 2020 IEEE Winter Conference on Applications of Computer Vision (WACV), pp.478-486, 2020. 10.1109\/wacv45572.2020.9093405","key":"1","DOI":"10.1109\/WACV45572.2020.9093405"},{"doi-asserted-by":"publisher","unstructured":"[2] G. Yasmin, S. Chowdhury, J. Nayak, P. Das, and A.K. Das, \u201cKey moment extraction for designing an agglomerative clustering algorithm-based video summarization framework,\u201d Neural Computing and Applications, vol.35, no.7, pp.4881-4902, 2023. 10.1007\/s00521-021-06132-1","key":"2","DOI":"10.1007\/s00521-021-06132-1"},{"doi-asserted-by":"crossref","unstructured":"[3] T. Bhattacharjee, S. Saha, A. Konar, and A.K. Nagar, \u201cStatic video summarization using artificial bee colony optimization,\u201d 2018 IEEE Symposium Series on Computational Intelligence (SSCI), pp.777-784, 2018. 10.1109\/ssci.2018.8628784","key":"3","DOI":"10.1109\/SSCI.2018.8628784"},{"doi-asserted-by":"crossref","unstructured":"[4] Z. Shou, J. Pan, J. Chan, K. Miyazawa, H. Mansour, A. Vetro, X. Giro-i-Nieto, and S.-F. Chang, \u201cOnline detection of action start in untrimmed, streaming videos,\u201d Proc. European Conference on Computer Vision (ECCV), vol.11207, pp.534-551, Sept. 2018. 10.1007\/978-3-030-01219-9_33","key":"4","DOI":"10.1007\/978-3-030-01219-9_33"},{"doi-asserted-by":"crossref","unstructured":"[5] M. Gao, M. Xu, L. Davis, R. Socher, and C. Xiong, \u201cStartnet: Online detection of action start in untrimmed videos,\u201d 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp.5541-5550, 2019. 10.1109\/iccv.2019.00564","key":"5","DOI":"10.1109\/ICCV.2019.00564"},{"doi-asserted-by":"crossref","unstructured":"[6] H. Eun, J. Moon, J. Park, C. Jung, and C. Kim, \u201cLearning to discriminate information for online action detection,\u201d 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.806-815, 2020. 10.1109\/cvpr42600.2020.00089","key":"6","DOI":"10.1109\/CVPR42600.2020.00089"},{"doi-asserted-by":"crossref","unstructured":"[7] R. De Geest and T. Tuytelaars, \u201cModeling temporal structure with LSTM for online action detection,\u201d 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp.1549-1557, 2018. 10.1109\/wacv.2018.00173","key":"7","DOI":"10.1109\/WACV.2018.00173"},{"unstructured":"[8] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, \u0141. Kaiser, and I. Polosukhin, \u201cAttention is all you need,\u201d Advances in Neural Information Processing Systems, vol.30, 2017.","key":"8"},{"doi-asserted-by":"crossref","unstructured":"[9] F. Cheng and G. Bertasius, \u201cTallformer: Temporal action localization with a long-memory transformer,\u201d European Conference on Computer Vision (ECCV), vol.13694, pp.503-521, 2022. 10.1007\/978-3-031-19830-4_29","key":"9","DOI":"10.1007\/978-3-031-19830-4_29"},{"doi-asserted-by":"crossref","unstructured":"[10] C.-L. Zhang, J. Wu, and Y. Li, \u201cActionformer: Localizing moments of actions with transformers,\u201d European Conference on Computer Vision (ECCV), vol.13664, pp.492-510, 2022. 10.1007\/978-3-031-19772-7_29","key":"10","DOI":"10.1007\/978-3-031-19772-7_29"},{"doi-asserted-by":"crossref","unstructured":"[11] J. An, H. Kang, S.H. Han, M.-H. Yang, and S.J. Kim, \u201cMiniroad: Minimal RNN framework for online action detection,\u201d 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp.10307-10316, 2023. 10.1109\/iccv51070.2023.00949","key":"11","DOI":"10.1109\/ICCV51070.2023.00949"},{"doi-asserted-by":"crossref","unstructured":"[12] N. Bodla, B. Singh, R. Chellappa, and L.S. Davis, \u201cSoft-nms \u2014 improving object detection with one line of code,\u201d 2017 IEEE International Conference on Computer Vision (ICCV), pp.5562-5570, 2017. 10.1109\/iccv.2017.593","key":"12","DOI":"10.1109\/ICCV.2017.593"},{"doi-asserted-by":"crossref","unstructured":"[13] T. Lin, X. Zhao, H. Su, C. Wang, and M. Yang, \u201cBsn: Boundary sensitive network for temporal action proposal generation,\u201d European Conference on Computer Vision (ECCV), Cham, vol.11208, pp.3-21, 2018. 10.1007\/978-3-030-01225-0_1","key":"13","DOI":"10.1007\/978-3-030-01225-0_1"},{"doi-asserted-by":"crossref","unstructured":"[14] J. Tan, J. Tang, L. Wang, and G. Wu, \u201cRelaxed transformer decoders for direct action proposal generation,\u201d Proc. IEEE\/CVF International Conference on Computer Vision (ICCV), pp.13526-13535, Oct. 2021. 10.1109\/iccv48922.2021.01327","key":"14","DOI":"10.1109\/ICCV48922.2021.01327"},{"doi-asserted-by":"publisher","unstructured":"[15] R. Zeng, W. Huang, M. Tan, Y. Rong, P. Zhao, J. Huang, and C. Gan, \u201cGraph convolutional module for temporal action localization in videos,\u201d IEEE Trans. Pattern Anal. Mach. Intell., vol.44, no.10, pp.6209-6223, 2022. 10.1109\/tpami.2021.3090167","key":"15","DOI":"10.1109\/TPAMI.2021.3090167"},{"doi-asserted-by":"crossref","unstructured":"[16] C. Lin, C. Xu, D. Luo, Y. Wang, Y. Tai, C. Wang, J. Li, F. Huang, and Y. Fu, \u201cLearning salient boundary feature for anchor-free temporal action localization,\u201d 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.3319-3328, 2021. 10.1109\/cvpr46437.2021.00333","key":"16","DOI":"10.1109\/CVPR46437.2021.00333"},{"doi-asserted-by":"crossref","unstructured":"[17] X. Liu, S. Bai, and X. Bai, \u201cAn empirical study of end-to-end temporal action detection,\u201d 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.19978-19987, 2022. 10.1109\/cvpr52688.2022.01938","key":"17","DOI":"10.1109\/CVPR52688.2022.01938"},{"doi-asserted-by":"publisher","unstructured":"[18] Y. Chen, B. Guo, Y. Shen, W. Wang, W. Lu, and X. Suo, \u201cCapsule boundary network with 3d convolutional dynamic routing for temporal action detection,\u201d IEEE Trans. Circuits Syst. Video Technol., vol.32, no.5, pp.2962-2975, 2022. 10.1109\/tcsvt.2021.3104226","key":"18","DOI":"10.1109\/TCSVT.2021.3104226"},{"doi-asserted-by":"crossref","unstructured":"[19] Z. Shou, J. Chan, A. Zareian, K. Miyazawa, and S.-F. Chang, \u201cCdc: Convolutional-de-convolutional networks for precise temporal action localization in untrimmed videos,\u201d 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.1417-1426, 2017. 10.1109\/cvpr.2017.155","key":"19","DOI":"10.1109\/CVPR.2017.155"},{"doi-asserted-by":"publisher","unstructured":"[20] B. Li, R. Liu, T. Chen, and Y. Zhu, \u201cWeakly supervised temporal action detection with temporal dependency learning,\u201d IEEE Trans. Circuits Syst. Video Technol., vol.32, no.7, pp.4473-4485, 2022. 10.1109\/tcsvt.2021.3125701","key":"20","DOI":"10.1109\/TCSVT.2021.3125701"},{"doi-asserted-by":"crossref","unstructured":"[21] R. De Geest, E. Gavves, A. Ghodrati, Z. Li, C. Snoek, and T. Tuytelaars, \u201cOnline action detection,\u201d European Conference on Computer Vision (ECCV), Cham, vol.9909, pp.269-284, 2016. 10.1007\/978-3-319-46454-1_17","key":"21","DOI":"10.1007\/978-3-319-46454-1_17"},{"doi-asserted-by":"publisher","unstructured":"[22] J. Huang, N. Li, T. Li, S. Liu, and G. Li, \u201cSpatial-temporal context-aware online action detection and prediction,\u201d IEEE Trans. Circuits Syst. Video Technol., vol.30, no.8, pp.2650-2662, 2020. 10.1109\/tcsvt.2019.2923712","key":"22","DOI":"10.1109\/TCSVT.2019.2923712"},{"doi-asserted-by":"publisher","unstructured":"[23] J. Liu, Y. Li, S. Song, J. Xing, C. Lan, and W. Zeng, \u201cMulti-modality multi-task recurrent neural network for online action detection,\u201d IEEE Trans. Circuits Syst. Video Technol., vol.29, no.9, pp.2667-2682, 2019. 10.1109\/tcsvt.2018.2799968","key":"23","DOI":"10.1109\/TCSVT.2018.2799968"},{"doi-asserted-by":"crossref","unstructured":"[24] X. Wang, S. Zhang, Z. Qing, Y. Shao, Z. Zuo, C. Gao, and N. Sang, \u201cOadtr: Online action detection with transformers,\u201d IEEE\/CVF International Conference on Computer Vision (CVPR), pp.7565-7575, 2021. 10.1109\/iccv48922.2021.00747","key":"24","DOI":"10.1109\/ICCV48922.2021.00747"},{"unstructured":"[25] M. Xu, Y. Xiong, H. Chen, X. Li, W. Xia, Z. Tu, and S. Soatto, \u201cLong short-term transformer for online action detection,\u201d Advances in Neural Information Processing Systems, pp.1086-1099, 2021.","key":"25"},{"doi-asserted-by":"crossref","unstructured":"[26] S. Cao, W. Luo, B. Wang, W. Zhang, and L. Ma, \u201cE2e-load: End-to-end long-form online action detection,\u201d 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp.10388-10398, 2023. 10.1109\/iccv51070.2023.00956","key":"26","DOI":"10.1109\/ICCV51070.2023.00956"},{"doi-asserted-by":"publisher","unstructured":"[27] T. Wang, Y. Chen, H. Lv, J. Teng, H. Snoussi, and F. Tao, \u201cOnline detection of action start via soft computing for smart city,\u201d IEEE Trans. Ind. Informat., vol.17, no.1, pp.524-533, 2021. 10.1109\/tii.2020.2997032","key":"27","DOI":"10.1109\/TII.2020.2997032"},{"doi-asserted-by":"publisher","unstructured":"[28] X. Hu, S. Wang, M. Li, Y. Li, and S. Du, \u201cDistribution-aware activity boundary representation for online detection of action start in untrimmed videos,\u201d IEEE Signal Process. Lett., vol.31, pp.765-769, 2024. 10.1109\/lsp.2024.3352503","key":"28","DOI":"10.1109\/LSP.2024.3352503"},{"doi-asserted-by":"crossref","unstructured":"[29] E. Apostolidis, G. Balaouras, V. Mezaris, and I. Patras, \u201cCombining global and local attention with positional encoding for video summarization,\u201d 2021 IEEE International Symposium on Multimedia (ISM), pp.226-234, 2021. 10.1109\/ism52913.2021.00045","key":"29","DOI":"10.1109\/ISM52913.2021.00045"},{"doi-asserted-by":"crossref","unstructured":"[30] J.A. Ghauri, S. Hakimov, and R. Ewerth, \u201cSupervised video summarization via multiple feature sets with parallel attention,\u201d 2021 IEEE International Conference on Multimedia and Expo (ICME), pp.1-6s, 2021. 10.1109\/icme51207.2021.9428318","key":"30","DOI":"10.1109\/ICME51207.2021.9428318"},{"doi-asserted-by":"publisher","unstructured":"[31] Y. Jung, D. Cho, D. Kim, S. Woo, and I.S. Kweon, \u201cDiscriminative feature learning for unsupervised video summarization,\u201d Proc. AAAI Conference on Artificial Intelligence, vol.33, no.1, pp.8537-8544, 2019. 10.1609\/aaai.v33i01.33018537","key":"31","DOI":"10.1609\/aaai.v33i01.33018537"},{"doi-asserted-by":"publisher","unstructured":"[32] Y. Yuan and J. Zhang, \u201cUnsupervised video summarization via deep reinforcement learning with shot-level semantics,\u201d IEEE Trans. Circuits Syst. Video Technol., vol.33, no.1, pp.445-456, 2023. 10.1109\/tcsvt.2022.3197819","key":"32","DOI":"10.1109\/TCSVT.2022.3197819"},{"doi-asserted-by":"crossref","unstructured":"[33] Y. Jung, D. Cho, S. Woo, and I.S. Kweon, \u201cGlobal-and-local relative position embedding for unsupervised video summarization,\u201d European Conference on Computer Vision (ECCV), vol.12370, pp.167-183, 2020. 10.1007\/978-3-030-58595-2_11","key":"33","DOI":"10.1007\/978-3-030-58595-2_11"},{"doi-asserted-by":"crossref","unstructured":"[34] B. He, J. Wang, J. Qiu, T. Bui, A. Shrivastava, and Z. Wang, \u201cAlign and attend: Multimodal summarization with dual contrastive losses,\u201d 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.14867-14878, 2023. 10.1109\/cvpr52729.2023.01428","key":"34","DOI":"10.1109\/CVPR52729.2023.01428"},{"unstructured":"[35] Y.G. Jiang, J. Liu, A. Roshan Zamir, G. Toderici, I. Laptev, M. Shah, and R. Sukthankar, \u201cTHUMOS challenge: Action recognition with a large number of classes,\u201d http:\/\/crcv.ucf.edu\/THUMOS14\/, 2014.","key":"35"},{"doi-asserted-by":"crossref","unstructured":"[36] F.C. Heilbron, V. Escorcia, B. Ghanem, and J.C. Niebles, \u201cActivitynet: A large-scale video benchmark for human activity understanding,\u201d 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.961-970, 2015. 10.1109\/cvpr.2015.7298698","key":"36","DOI":"10.1109\/CVPR.2015.7298698"},{"doi-asserted-by":"crossref","unstructured":"[37] Y. Song, J. Vallmitjana, A. Stent, and A. Jaimes, \u201cTvsum: Summarizing web videos using titles,\u201d 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.5179-5187, 2015. 10.1109\/cvpr.2015.7299154","key":"37","DOI":"10.1109\/CVPR.2015.7299154"},{"doi-asserted-by":"crossref","unstructured":"[38] M. Gygli, H. Grabner, H. Riemenschneider, and L.V. Gool, \u201cCreating summaries from user videos,\u201d European Conference on Computer Vision (ECCV), vol.8695, pp.505-520, 2014. 10.1007\/978-3-319-10584-0_33","key":"38","DOI":"10.1007\/978-3-319-10584-0_33"},{"doi-asserted-by":"crossref","unstructured":"[39] M. Otani, Y. Nakashima, E. Rahtu, and J. Heikkil\u00e4, \u201cRethinking the evaluation of video summaries,\u201d 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.7588-7596, 2019. 10.1109\/cvpr.2019.00778","key":"39","DOI":"10.1109\/CVPR.2019.00778"},{"doi-asserted-by":"crossref","unstructured":"[40] K. Zhang, W.-L. Chao, F. Sha, and K. Grauman, \u201cVideo summarization with long short-term memory,\u201d European Conference on Computer Vision (ECCV), Cham, vol.9911, pp.766-782, 2016. 10.1007\/978-3-319-46478-7_47","key":"40","DOI":"10.1007\/978-3-319-46478-7_47"},{"doi-asserted-by":"crossref","unstructured":"[41] M. Xu, M. Gao, Y.-T. Chen, L. Davis, and D. Crandall, \u201cTemporal recurrent networks for online action detection,\u201d 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp.5531-5540, 2019. 10.1109\/iccv.2019.00563","key":"41","DOI":"10.1109\/ICCV.2019.00563"},{"doi-asserted-by":"crossref","unstructured":"[42] Y. Zhao and P. Kr\u00e4henb\u00fchl, \u201cReal-time online video detection with temporal smoothing transformers,\u201d European Conference on Computer Vision (ECCV), vol.13694, pp.485-502, 2022. 10.1007\/978-3-031-19830-4_28","key":"42","DOI":"10.1007\/978-3-031-19830-4_28"},{"doi-asserted-by":"crossref","unstructured":"[43] L. Wang, Y. Xiong, Z. Wang, Y. Qiao, D. Lin, X. Tang, and L. Van Gool, \u201cTemporal segment networks: Towards good practices for deep action recognition,\u201d European Conference on Computer Vision (ECCV), Cham, vol.9912, pp.20-36, 2016. 10.1007\/978-3-319-46484-8_2","key":"43","DOI":"10.1007\/978-3-319-46484-8_2"},{"doi-asserted-by":"crossref","unstructured":"[44] K. He, X. Zhang, S. Ren, and J. Sun, \u201cDeep residual learning for image recognition,\u201d 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.770-778, 2016. 10.1109\/cvpr.2016.90","key":"44","DOI":"10.1109\/CVPR.2016.90"},{"unstructured":"[45] S. Ioffe and C. Szegedy, \u201cBatch normalization: Accelerating deep network training by reducing internal covariate shift,\u201d International Conference on Machine Learning, pp.448-456, 2015.","key":"45"},{"doi-asserted-by":"crossref","unstructured":"[46] C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, and A. Rabinovich, \u201cGoing deeper with convolutions,\u201d 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.1-9, 2015. 10.1109\/cvpr.2015.7298594","key":"46","DOI":"10.1109\/CVPR.2015.7298594"},{"unstructured":"[47] https:\/\/github.com\/Breakthrough\/PySceneDetect.","key":"47"},{"unstructured":"[48] https:\/\/github.com\/johmathe\/Shotdetect.","key":"48"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E107.D\/9\/E107.D_2024EDP7031\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T16:25:01Z","timestamp":1732724701000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E107.D\/9\/E107.D_2024EDP7031\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,1]]},"references-count":48,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2024]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2024edp7031","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"type":"print","value":"0916-8532"},{"type":"electronic","value":"1745-1361"}],"subject":[],"published":{"date-parts":[[2024,9,1]]},"article-number":"2024EDP7031"}}