{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,19]],"date-time":"2025-12-19T10:09:55Z","timestamp":1766138995224,"version":"3.40.3"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031784460"},{"type":"electronic","value":"9783031784477"}],"license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78447-7_25","type":"book-chapter","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:24:27Z","timestamp":1733185467000},"page":"371-386","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Large Models in\u00a0Dialogue for\u00a0Active Perception and\u00a0Anomaly Detection"],"prefix":"10.1007","author":[{"given":"Tzoulio","family":"Chamiti","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nikolaos","family":"Passalis","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Anastasios","family":"Tefas","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"25_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.comcom.2020.03.012","volume":"156","author":"B Mishra","year":"2020","unstructured":"Mishra, B., Garg, D., Narang, P., Mishra, V.: Drone-surveillance for search and rescue in natural disaster. Comput. Commun. 156, 1\u201310 (2020)","journal-title":"Comput. Commun."},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Chriki, A., Touati, H., Snoussi, H., Kamoun, F.: UAV-based surveillance system: an anomaly detection approach. In: Proceedings of the IEEE Symposium on Computers and Communications (ISCC), pp. 1\u20136 (2020)","DOI":"10.1109\/ISCC50000.2020.9219585"},{"key":"25_CR3","doi-asserted-by":"crossref","unstructured":"Gasparini, R., et al.: Anomaly detection, localization and classification for railway inspection. In: Proceedings of the International Conference on Pattern Recognition (ICPR), pp. 3419\u20133426 (2021)","DOI":"10.1109\/ICPR48806.2021.9412972"},{"key":"25_CR4","doi-asserted-by":"crossref","unstructured":"Zhai, X., Liu, K., Nash, W., Castineira, D.: Smart autopilot drone system for surface surveillance and anomaly detection via customizable deep neural network. In: IPTC International Petroleum Technology Conference, 14 January 2020, vol. Day 2 Tue, p. D021S053R001 (2020)","DOI":"10.2523\/IPTC-20111-MS"},{"key":"25_CR5","doi-asserted-by":"crossref","unstructured":"Unlu, E., Zenou, E., Riviere, N., Dupouy, P.E.: An autonomous drone surveillance and tracking architecture. In: 2019 Autonomous Vehicles and Machines Conference, AVM 2019, vol. 2019, pp. 35-1\u201335-7 (2019)","DOI":"10.2352\/ISSN.2470-1173.2019.15.AVM-035"},{"key":"25_CR6","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1007\/s10514-017-9615-3","volume":"42","author":"R Bajcsy","year":"2018","unstructured":"Bajcsy, R., Aloimonos, Y., Tsotsos, J.K.: Revisiting active perception. Auton. Robot. 42, 177\u2013196 (2018)","journal-title":"Auton. Robot."},{"issue":"2","key":"25_CR7","doi-asserted-by":"publisher","first-page":"2517","DOI":"10.1109\/LRA.2021.3062004","volume":"6","author":"N Saito","year":"2021","unstructured":"Saito, N., Ogata, T., Funabashi, S., Mori, H., Sugano, S.: How to select and use tools?: active perception of target objects using multimodal deep learning. IEEE Robot. Autom. Lett. 6(2), 2517\u20132524 (2021)","journal-title":"IEEE Robot. Autom. Lett."},{"key":"25_CR8","doi-asserted-by":"crossref","unstructured":"Manousis, T., Passalis, N., Tefas, A.: Enabling high-resolution pose estimation in real time using active perception. In: Proceedings of the IEEE International Conference on Image Processing (ICIP), pp. 2425\u20132429 (2023)","DOI":"10.1109\/ICIP49359.2023.10223083"},{"key":"25_CR9","doi-asserted-by":"crossref","unstructured":"Agrawal, A., et al.: VQA: visual question answering. arXiv:1505.00468 (2016)","DOI":"10.1007\/s11263-016-0966-6"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Deng, Y., Guo, D., Guo, X., Zhang, N., Liu, H., Sun, F.: MQA: answering the question via robotic manipulation. In: Robotics: Science and Systems XVII, RSS2021. Robotics: Science and Systems Foundation (2021)","DOI":"10.15607\/RSS.2021.XVII.044"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Gordon, D., Kembhavi, A., Rastegari, M., Redmon, J., Fox, D., Farhadi, A.: IQA: visual question answering in interactive environments. arXiv:1712.03316 (2018)","DOI":"10.1109\/CVPR.2018.00430"},{"key":"25_CR12","doi-asserted-by":"crossref","unstructured":"Das, A., Datta, S., Gkioxari, G., Lee, S., Parikh, D., Batra, D.: Embodied question answering. arXiv:1711.11543 (2017)","DOI":"10.1109\/CVPR.2018.00008"},{"key":"25_CR13","doi-asserted-by":"crossref","unstructured":"Vemprala, S., Bonatti, R., Bucker, A., Kapoor, A.: ChatGPT for robotics: design principles and model abilities. arXiv:2306.17582 (2023)","DOI":"10.1109\/ACCESS.2024.3387941"},{"key":"25_CR14","unstructured":"Tazir, M.L., Mancas, M., Dutoit, T.: From words to flight: integrating OpenAI ChatGPT with px4\/gazebo for natural language-based drone control. In: Proceedings of the 13th International Workshop on Computer Science and Engineering (2023)"},{"key":"25_CR15","doi-asserted-by":"crossref","unstructured":"Ye, Y., You, H., Du, J.: Improved trust in human-robot collaboration with chatGPT. arXiv:2304.12529 (2023)","DOI":"10.1109\/ACCESS.2023.3282111"},{"key":"25_CR16","doi-asserted-by":"crossref","unstructured":"Liang, J., et al.: Code as policies: language model programs for embodied control. arXiv:2209.07753 (2023)","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"25_CR17","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual ChatGPT: talking, drawing and editing with visual foundation models. arXiv:2303.04671 (2023)"},{"key":"25_CR18","unstructured":"Shen, Y.,\u00a0Song, K.,\u00a0Tan, X.,\u00a0Li, D., Lu, W., Zhuang, Y.: HuggingGPT: solving AI tasks with chatgpt and its friends in hugging face. arXiv:2303.17580 (2023)"},{"key":"25_CR19","unstructured":"Wu, S., Fei, H., Qu, L., Ji, W., Chua, T.-S.: Next-GPT: any-to-any multimodal LLM. arXiv:2309.05519 (2023)"},{"key":"25_CR20","unstructured":"Shridhar, M., Manuelli, L., Fox, D.: CLIPort: what and where pathways for robotic manipulation. arXiv:2109.12098 (2021)"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Bucker, A., Figueredo, L., Haddadin, S., Kapoor, A., Ma, S., Bonatti, R.: Reshaping robot trajectories using natural language commands: a study of multi-modal data alignment using transformers. arXiv:2203.13411 (2022)","DOI":"10.1109\/IROS47612.2022.9981810"},{"key":"25_CR22","unstructured":"Stepputtis, S., Campbell, J., Phielipp, M., Lee, S., Baral, C., Amor, H.B.: Language-conditioned imitation learning for robot manipulation tasks. arXiv:2010.12083 (2020)"},{"key":"25_CR23","unstructured":"Zhu, D., Chen, J., Haydarov, K., Shen, X., Zhang,W., Elhoseiny, M.: ChatGPT asks, blip-2 answers: automatic questioning towards enriched visual descriptions. arXiv:2303.06594 (2023)"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Rotstein, N., Bensaid, D., Brody, S., Ganz, R., Kimmel, R.: FuseCap: leveraging large language models for enriched fused image captions. arXiv:2305.17718 (2023)","DOI":"10.1109\/WACV57701.2024.00559"},{"key":"25_CR25","unstructured":"Levy, M., Ben-Ari, R., Darshan, N., Lischinski, D.: Chatting makes perfect: chat-based image retrieval. arXiv:2305.20062 (2023)"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Ricci, R., Bazi, Y., Melgani, F.: Machine-to-machine visual dialoguing with chatGPT for enriched textual image description. Remote Sens. 16(3) (2024)","DOI":"10.3390\/rs16030441"},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Hu, Y., Hua, H., Yang, Z., Shi, W., Smith, N.A., Luo, J.: PromptCap: prompt-guided task-aware image captioning. arXiv:2211.09699 (2023)","DOI":"10.1109\/ICCV51070.2023.00277"},{"key":"25_CR28","unstructured":"Yu, Z., Ouyang, X., Shao, Z., Wang, M., Yu, J.: Prophet: prompting large language models with complementary answer heuristics for knowledge-based visual question answering. arXiv:2303.01903 (2023)"},{"key":"25_CR29","doi-asserted-by":"crossref","unstructured":"Ravi, S., Chinchure, A., Sigal, L., Liao, R., Shwartz, V.: VLC-BERT: visual question answering with contextualized commonsense knowledge. arXiv:2210.13626 (2022)","DOI":"10.1109\/WACV56688.2023.00121"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Tiong, A.M.H., Li, J., Li, B., Savarese, S., Hoi, S.C.H.: Plug-and-play VQA: zero-shot VQA by conjoining large pretrained models with zero training. arXiv:2210.08773 (2023)","DOI":"10.18653\/v1\/2022.findings-emnlp.67"},{"key":"25_CR31","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. arXiv:2201.12086 (2022)"},{"key":"25_CR32","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-CAM: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"25_CR33","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Proceedings of the Advances in Neural Information Processing Systems, vol. 33, pp. 1877\u20131901 (2020)"},{"key":"25_CR34","unstructured":"Zhang, Y., et al.: Siren\u2019s song in the AI ocean: a survey on hallucination in large language models. arXiv:2309.01219 (2023)"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"Shah, S., Dey, D., Lovett, C., Kapoor, A.: AirSim: high-fidelity visual and physical simulation for autonomous vehicles. arXiv:1705.05065 (2017)","DOI":"10.1007\/978-3-319-67361-5_40"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78447-7_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T01:08:23Z","timestamp":1733188103000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78447-7_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"ISBN":["9783031784460","9783031784477"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78447-7_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"3 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}