{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T00:58:48Z","timestamp":1775869128026,"version":"3.50.1"},"reference-count":21,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,1,25]],"date-time":"2025-01-25T00:00:00Z","timestamp":1737763200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,1,25]],"date-time":"2025-01-25T00:00:00Z","timestamp":1737763200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Discov Artif Intell"],"DOI":"10.1007\/s44163-025-00230-y","type":"journal-article","created":{"date-parts":[[2025,1,25]],"date-time":"2025-01-25T15:44:57Z","timestamp":1737819897000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Automatic summarization of cooking videos using transfer learning and transformer-based models"],"prefix":"10.1007","volume":"5","author":[{"given":"P. M. Alen","family":"Sadique","sequence":"first","affiliation":[]},{"given":"R. V.","family":"Aswiga","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,25]]},"reference":[{"key":"230_CR1","doi-asserted-by":"crossref","unstructured":"Sobue R, et al. Cooking video summarization guided by matching with step-by-step recipe photos. 2019, 16th International conference on machine vision applications (MVA). IEEE, 2019.","DOI":"10.23919\/MVA.2019.8757903"},{"key":"230_CR2","doi-asserted-by":"crossref","unstructured":"Zhou L, et al. End-to-end dense video captioning with masked transformer. Proceedings of the IEEE conference on computer vision and pattern recognition. 2018.","DOI":"10.1109\/CVPR.2018.00911"},{"key":"230_CR3","unstructured":"Nishimura T, et al. Recipe generation from unsegmented cooking videos. arXiv preprint arXiv:2209.10134. 2022."},{"key":"230_CR4","doi-asserted-by":"crossref","unstructured":"Wu J, et al. Ingredient-enriched recipe generation from cooking videos. Proceedings of the 2022 international conference on multimedia retrieval. 2022.","DOI":"10.1145\/3512527.3531388"},{"key":"230_CR5","unstructured":"Saheb S, et al. Recipes creation using food images through inverse cooking. J Algebr Stat. 2022;13(3):2391\u20136."},{"key":"230_CR6","doi-asserted-by":"crossref","unstructured":"Chhikara P, et al. FIRE: food image to recipe generation. Proceedings of the IEEE\/CVF winter conference on applications of computer vision. 2024.","DOI":"10.1109\/WACV57701.2024.00800"},{"key":"230_CR7","unstructured":"Mahal R, et al. Image-to-recipe translation using multi-model architecture. Image 7.05. 2020."},{"key":"230_CR8","doi-asserted-by":"crossref","unstructured":"Papadopoulos DP, et al. Learning program representations for food images and cooking recipes. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01606"},{"key":"230_CR9","doi-asserted-by":"crossref","unstructured":"Khan AM, et al. Rethinking cooking state recognition with vision transformers. 2022, 25th International conference on computer and information technology (ICCIT). IEEE, 2022.","DOI":"10.1109\/ICCIT57492.2022.10055869"},{"key":"230_CR10","unstructured":"Chen J, Yin Y, Xu Y. Recipe snap\u2014a light weight image-to-recipe model. arXiv preprint arXiv:2205.02141. 2022."},{"key":"230_CR11","unstructured":"Doman K, et al. Video cooking: towards the synthesis of multimedia cooking recipes. Advances in multimedia modeling: 17th international multi-media modeling conference, MMM 2011, Taipei, Taiwan, January 5\u20137, 2011, Proceedings, Part II 17. Berlin, Heidelberg: Springer; 2011."},{"key":"230_CR12","doi-asserted-by":"crossref","unstructured":"Xu J, et al. Learning multimodal attention LSTM networks for video captioning. Proceedings of the 25th ACM international conference on multimedia. 2017.","DOI":"10.1145\/3123266.3123448"},{"key":"230_CR13","doi-asserted-by":"crossref","unstructured":"\u00d6zer EG, et al. Deep learning based, a new model for video captioning. Int J Adv Comput Sci Appl. 2020;11(3).","DOI":"10.14569\/IJACSA.2020.0110365"},{"key":"230_CR14","unstructured":"Yu Q, Mao D, Wang J. Deep learning based food recognition. Technical report, Stanford University; 2016."},{"key":"230_CR15","doi-asserted-by":"crossref","unstructured":"Fakhrou A, Kunhoth J, Al Maadeed S. Smart phone-based food recognition system using multiple deep CNN models. Multim Tools Appl. 2021;80(21):33011\u201332.","DOI":"10.1007\/s11042-021-11329-6"},{"key":"230_CR16","unstructured":"Tang Y, et al. Video understanding with large language models: a survey. arXiv preprint arXiv:2312.17432. 2023."},{"key":"230_CR17","unstructured":"Sahoo P, et al. A systematic survey of prompt engineering in large language models: techniques and applications. arXiv preprint arXiv:2402.07927. 2024."},{"issue":"1","key":"230_CR18","first-page":"25","volume":"1","author":"CM Paredes","year":"2023","unstructured":"Paredes CM, Gallardo CM, Claudio YMS. ChatGPT API: brief overview and integration in software development. Int J Eng Ins. 2023;1(1):25\u20139.","journal-title":"Int J Eng Ins"},{"issue":"13","key":"230_CR19","doi-asserted-by":"publisher","first-page":"4817","DOI":"10.3390\/s22134817","volume":"22","author":"H Im","year":"2022","unstructured":"Im H, Choi Y-S. UAT: universal attention transformer for video captioning. Sensors. 2022;22(13):4817.","journal-title":"Sensors"},{"key":"230_CR20","doi-asserted-by":"crossref","unstructured":"Zhao H, et al. Video captioning based on vision transformer and reinforcement learning. Peer J Comput Sci. 2022;8:e916.","DOI":"10.7717\/peerj-cs.916"},{"key":"230_CR21","doi-asserted-by":"crossref","unstructured":"Huang X, et al. Fusion of multi-modal features to enhance dense video caption. Sensors. 2023;23(12):5565.","DOI":"10.3390\/s23125565"}],"container-title":["Discover Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44163-025-00230-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s44163-025-00230-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44163-025-00230-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,25]],"date-time":"2025-01-25T15:45:06Z","timestamp":1737819906000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s44163-025-00230-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,25]]},"references-count":21,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["230"],"URL":"https:\/\/doi.org\/10.1007\/s44163-025-00230-y","relation":{},"ISSN":["2731-0809"],"issn-type":[{"value":"2731-0809","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,25]]},"assertion":[{"value":"30 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"7"}}