{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T15:13:03Z","timestamp":1776352383330,"version":"3.51.2"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,4,12]],"date-time":"2025-04-12T00:00:00Z","timestamp":1744416000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,12]],"date-time":"2025-04-12T00:00:00Z","timestamp":1744416000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s00138-025-01677-w","type":"journal-article","created":{"date-parts":[[2025,4,12]],"date-time":"2025-04-12T10:05:08Z","timestamp":1744452308000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["TFF-temporal fusion framework for advancing video retrieval through long-range dependencies and multi-modal intent"],"prefix":"10.1007","volume":"36","author":[{"given":"Pratibha","family":"Singh","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kashvi","family":"Chakrawal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alok Kumar Singh","family":"Kushwaha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,12]]},"reference":[{"key":"1677_CR1","doi-asserted-by":"publisher","first-page":"1204","DOI":"10.1109\/TIP.2022.3140611","volume":"31","author":"X Yang","year":"2022","unstructured":"Yang, X., Wang, S., Dong, J., Dong, J., Wang, M., Chua, T.S.: Video moment retrieval with cross-modal neural architecture search. IEEE Trans. Image Process. 31, 1204\u20131216 (2022). https:\/\/doi.org\/10.1109\/TIP.2022.3140611","journal-title":"IEEE Trans. Image Process."},{"key":"1677_CR2","doi-asserted-by":"crossref","unstructured":"Ji, W., Liang, R., Zheng, Z., Zhang, W., Zhang, S., Li, J., Li, M., Chua, T.S.: Are binary annotations sufficient? Video Moment Retrieval via Hierarchical Uncertainty-based Active Learning (2023)","DOI":"10.1109\/CVPR52729.2023.02204"},{"key":"1677_CR3","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: Tall: temporal activity localization via language query. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"1677_CR4","doi-asserted-by":"crossref","unstructured":"Anne Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: IEEE\/CVF International Conference on Computer Vision. IEEE, Pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"1677_CR5","unstructured":"Escorcia, V., Soldan, M., Sivic, J., Ghanem, B., Russell, B.: Temporal localization of moments in video collections with natural language. arXiv preprint arXiv:1907.12763 (2019)"},{"key":"1677_CR6","doi-asserted-by":"crossref","unstructured":"Huang, K.C., Wu, T.H., Su, H.T., Hsu, W.H.: Monocular 3d object detection with depth-aware transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4012\u20134021 (2022)","DOI":"10.1109\/CVPR52688.2022.00398"},{"key":"1677_CR7","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"1677_CR8","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B.: The kinetics human action video dataset technical report. Arxiv (2017)"},{"key":"1677_CR9","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Ma, L., Zhu, W.: Sentence specified dynamic video thumbnail generation. In: ACM MM (2019)","DOI":"10.1145\/3343031.3350985"},{"key":"1677_CR10","unstructured":"Radford, A., Kim, J.W., Hallacy, C.: Learning transferable visual models from natural language super VISION. In: Proceedings of the International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"1677_CR11","first-page":"11846","volume":"34","author":"J Lei","year":"2021","unstructured":"Lei, J., Berg, T.L., Bansal, M.: QVHighlights: detecting moments and highlights in videos via natural language queries. Adv. Neural Inf. Process. 34, 11846\u201311858 (2021)","journal-title":"Adv. Neural Inf. Process."},{"key":"1677_CR12","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. In: Advances in Neural Information Processing System. In: neurlPS (2016)"},{"key":"1677_CR13","doi-asserted-by":"crossref","unstructured":"Bello, I., Zoph, B., Vaswani, A., Shlens, J., Le, Q.V.: Attention augmented convolutional networks. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00338"},{"key":"1677_CR14","doi-asserted-by":"crossref","unstructured":"Song, Y., Vallmitjana, J., Stent, A., Jaimes, A.: Tvsum: summarizing web videos using titles. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 5179\u20135187 (2016)","DOI":"10.1109\/CVPR.2015.7299154"},{"key":"1677_CR15","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"1677_CR16","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Wu, Y., Chen, C. W., Shan, Y., Qie, X.: UMT: unified multi-modal transformers for joint video moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"1677_CR17","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Berg, T.L., Bansal, M.: Tvr: a large-scale dataset for video subtitle moment retrieval. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"1677_CR18","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, Y.G.: Semantic proposal for activity localization in videos via sentence query. In: AAAI Conference on Artificial Intelligence. AAAI, pp. 8199\u20138206 (2019)","DOI":"10.1609\/aaai.v33i01.33018199"},{"key":"1677_CR19","doi-asserted-by":"crossref","unstructured":"Mahasseni, B., Lam, M., Todorovic, S.: Unsupervised video summarization with adversarial lstm network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 202\u2013211: CVPR (2017)","DOI":"10.1109\/CVPR.2017.318"},{"key":"1677_CR20","doi-asserted-by":"crossref","unstructured":"Hong, F.T., Huang, X., Li, W.H., Zheng, W.S.: Mini-net: Multiple instance ranking network for video highlight detection. In: ECCV, pp. 345\u2013360 (2020)","DOI":"10.1007\/978-3-030-58601-0_21"},{"issue":"22","key":"1677_CR21","doi-asserted-by":"publisher","first-page":"e40142","DOI":"10.1016\/j.heliyon.2024.e40142","volume":"10","author":"A Farooq","year":"2024","unstructured":"Farooq, A., Uddin, M.I., Adnan, M., Alarood, A.A., Alsolami, E., Habibullah, S.: Interpretable multi-horizon time series forecasting of cryptocurrencies by leveraging temporal fusion transformer. Heliyon 10(22), e40142 (2024). https:\/\/doi.org\/10.1016\/j.heliyon.2024.e40142","journal-title":"Heliyon"},{"key":"1677_CR22","doi-asserted-by":"crossref","unstructured":"Wang, W., Huang, Y., Wang, L.: Language driven temporal activity localization: a semantic matching reinforcement learning model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern. CVPR, pp. 334\u2013343 ( 2019)","DOI":"10.1109\/CVPR.2019.00042"},{"key":"1677_CR23","doi-asserted-by":"crossref","unstructured":"Sun, M., Farhadi, A., Seitz, S.: Ranking domain specific highlights by analyzing edited videos. In: European Conference on Computer Vision. ECCV, pp. 787\u2013802 (2016)","DOI":"10.1007\/978-3-319-10590-1_51"},{"key":"1677_CR24","unstructured":"Tang, H., Kwatra, V., Sargin, M.E., Gargi, U.: Detecting highlights in sports videos: cricket as a test case. In: Proceedings of the IEEE International Conference on multimedia and expo. ICME, pp. 1\u20136 (2018)"},{"key":"1677_CR25","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J.: Attention is all you need. In: Advances in Neural Information Processing Systems. neurlIPS, pp. 5998\u20136008 (2017)"},{"key":"1677_CR26","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on computer vision. ICCV, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"1677_CR27","unstructured":"Wang, J., Xu, C., Chng, E., Tian, Q.: Sports highlight detection from keyword sequences using hmm. In: Proceedings of the IEEE International Conference. ICME, pp. 599\u2013602 (2016)"},{"key":"1677_CR28","doi-asserted-by":"crossref","unstructured":"Kong, Q., Cao, Y., Iqbal, T., Wang, Y., Wang, W., Pann, M.P.: Large-scale pretrained audio neural networks for audio pattern recognition. In: IEEE\/ACM, pp. 2880\u20132894 (2020)","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"1677_CR29","doi-asserted-by":"crossref","unstructured":"Yao, T., Mei, T., Rui, Y.: Highlight detection with pairwise deep ranking for first-person video summarization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR. 982\u2013990 (2016)","DOI":"10.1109\/CVPR.2016.112"},{"key":"1677_CR30","doi-asserted-by":"crossref","unstructured":"Zhang, K., Chao, W.L., Sha, F., Grauman, K.: Video summarization with long short-term memory. In: European Conference on Computer Vision. Springer, pp. 766\u2013782 (2016)","DOI":"10.1007\/978-3-319-46478-7_47"},{"key":"1677_CR31","doi-asserted-by":"crossref","unstructured":"Ye, Q., Shen, X., Gao, Y., Wang, Z., Bi, Q., Li, P., Yang, G.: Temporal cue guided video highlight detection with low-rank audio-visual fusion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7950\u20137959 (2021)","DOI":"10.1109\/ICCV48922.2021.00785"},{"key":"1677_CR32","doi-asserted-by":"crossref","unstructured":"Tellex, S., Roy, D.: Towards surveillance video search by natural language query. In: Proceedings of the ACM International Conference on Image and Video Retrieval. CIVR, pp. 1\u20138 (2019)","DOI":"10.1145\/1646396.1646442"},{"key":"1677_CR33","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis. action recognition? A new model and the kinetics dataset. In: IEEE\/CVF Conference on Computer Vision. CVPR, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"1677_CR34","doi-asserted-by":"crossref","unstructured":"Badamdorj, T., Rochan, M., Wang, Y., Cheng, L.: Joint visual and audio learning for video highlight detection. ICCV. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8127\u20138137 (2021)","DOI":"10.1109\/ICCV48922.2021.00802"},{"key":"1677_CR35","doi-asserted-by":"publisher","first-page":"121897","DOI":"10.1016\/j.eswa.2024.121897","volume":"238","author":"L de Azevedo Takara","year":"2024","unstructured":"de Azevedo Takara, L., Santos, A.A.P., Mariani, V.C., dos Santos Coelho, L.: Deep reinforcement learning applied to a sparse-reward trading environment with intraday data. Expert Syst. Appl. 238, 121897 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2024.121897","journal-title":"Expert Syst. Appl."},{"key":"1677_CR36","doi-asserted-by":"crossref","unstructured":"Gao, J., Xu, C.: Fast video moment retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1523\u20131532 (2021)","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"1677_CR37","doi-asserted-by":"crossref","unstructured":"Law, H., Deng, J.: Cornernet: detecting objects as paired keypoints. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 734\u2013750 (2018)","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"1677_CR38","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., Ellis, D.P., Freedman, D., Jansen, A.: Audio set: an ontology and human-labeled dataset for audio events. In: International Conference on Acoustics, Speech and Signal. ICASSP, pp. 776\u2013780 (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"1677_CR39","unstructured":"Zhou, X., Wang, D., Kr\u00e4henb\u00fchl, P. Objects as points. In: Proceedings of the IEEE\/CVF Conference. CVPR (2019)"},{"key":"1677_CR40","doi-asserted-by":"crossref","unstructured":"Lin, Z., Zhao, Z., Zhang, Z., Wang, Q., Liu, H.: Weakly-supervised video moment retrieval via semantic completion network. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34 (2020)","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"1677_CR41","unstructured":"Ba, D., Kingma, P.: Adam a method for stochastic optimization. In: Proceedings of the International Conference on Learning Representations. In: ICLR (2017)"},{"key":"1677_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, D., Dai, X., Wang, X., Wang, Y.F., Davis, L.S.: Man: MoMan: moment alignment network for natural language moment retrieval via iterative graph adjustment. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR. 1247\u20131257 (2019)","DOI":"10.1109\/CVPR.2019.00134"},{"key":"1677_CR43","doi-asserted-by":"crossref","unstructured":"Badamdorj, T., Rochan, M., Wang, Y., Cheng, L.: Contrastive learning for unsupervised video highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022).","DOI":"10.1109\/CVPR52688.2022.01365"},{"key":"1677_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, S., Peng, H., Fu, J. and Luo, J.: Learning 2d temporal adjacent networks for moment localization with natural language. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 12870\u201312877 (2020)","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"1677_CR45","doi-asserted-by":"crossref","unstructured":"Xiong, B., Kalantidis, Y., Ghadiyaram, D., Grauman, K.: Less is more: learning highlight detection from video duration. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1258\u20131267 (2019)","DOI":"10.1109\/CVPR.2019.00135"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-025-01677-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00138-025-01677-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-025-01677-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,9]],"date-time":"2025-05-09T14:31:25Z","timestamp":1746801085000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00138-025-01677-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,12]]},"references-count":45,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["1677"],"URL":"https:\/\/doi.org\/10.1007\/s00138-025-01677-w","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"value":"0932-8092","type":"print"},{"value":"1432-1769","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,12]]},"assertion":[{"value":"1 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 January 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 March 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 April 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"67"}}