{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T00:08:56Z","timestamp":1755907736569,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,15]],"date-time":"2023-12-15T00:00:00Z","timestamp":1702598400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,15]]},"DOI":"10.1145\/3627631.3627659","type":"proceedings-article","created":{"date-parts":[[2024,1,31]],"date-time":"2024-01-31T12:08:32Z","timestamp":1706702912000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Aggregated Co-attention based Visual Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0792-3006","authenticated-orcid":false,"given":"Aakansha","family":"Mishra","sequence":"first","affiliation":[{"name":"IIT Guwahati, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0024-3358","authenticated-orcid":false,"given":"Ashish","family":"Anand","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology Guwahati, IN"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2885-0026","authenticated-orcid":false,"given":"Prithwijit","family":"Guha","sequence":"additional","affiliation":[{"name":"IIT Guwahati, IN"}]}],"member":"320","published-online":{"date-parts":[[2024,1,31]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","volume-title":"Learning to compose neural networks for question answering. arXiv preprint arXiv:1601.01705","author":"Andreas Jacob","year":"2016","unstructured":"Jacob Andreas, Marcus Rohrbach, Trevor Darrell, and Dan Klein. 2016. Learning to compose neural networks for question answering. arXiv preprint arXiv:1601.01705 (2016)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.285"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018102"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00209"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_8_1","unstructured":"Christopher Clark Mark Yatskar and Luke Zettlemoyer. 2019. Don\u2019t Take the Easy Way Out: Ensemble Based Methods for Avoiding Known Dataset Biases. arxiv:1909.03683"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00048"},{"key":"e_1_3_2_1_10_1","volume-title":"Question-Agnostic Attention for Visual Question Answering. In 25th International Conference on Pattern Recognition (ICPR). IEEE, 3542\u20133549","author":"Farazi Moshiur","year":"2021","unstructured":"Moshiur Farazi, Salman Khan, and Nick Barnes. 2021. Question-Agnostic Attention for Visual Question Answering. In 25th International Conference on Pattern Recognition (ICPR). IEEE, 3542\u20133549."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1044"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2020.2989701"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00680"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision. 5825\u20135835","author":"Gao Peng","year":"2019","unstructured":"Peng Gao, Haoxuan You, Zhanpeng Zhang, Xiaogang Wang, and Hongsheng Li. 2019. Multi-modality Latent Interaction Network for Visual Question Answering. In Proceedings of the IEEE International Conference on Computer Vision. 5825\u20135835."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.41"},{"key":"e_1_3_2_1_16_1","volume-title":"IQ-VQA: Intelligent Visual Question Answering. In International Workshop on Video and Image Question Answering (ICPR 2021 Workshops)(Lecture Notes in Computer Science (LNCS), Vol.\u00a011942)","author":"Goel Vatsal","year":"2021","unstructured":"Vatsal Goel, Mohit Chandak, Ashish Anand, and Prithwijit Guha. 2021. IQ-VQA: Intelligent Visual Question Answering. In International Workshop on Video and Image Question Answering (ICPR 2021 Workshops)(Lecture Notes in Computer Science (LNCS), Vol.\u00a011942). 525\u2013532."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.217"},{"key":"e_1_3_2_1_22_1","volume-title":"ask, attend, and answer: A strong baseline for visual question answering. arXiv preprint arXiv:1704.03162","author":"Kazemi Vahid","year":"2017","unstructured":"Vahid Kazemi and Ali Elqursh. 2017. Show, ask, attend, and answer: A strong baseline for visual question answering. arXiv preprint arXiv:1704.03162 (2017)."},{"key":"e_1_3_2_1_23_1","unstructured":"Jin-Hwa Kim Jaehyun Jun and Byoung-Tak Zhang. 2018. Bilinear Attention Networks. In Advances in Neural Information Processing Systems. 1564\u20131574."},{"key":"e_1_3_2_1_24_1","volume-title":"Hadamard Product for Low-rank Bilinear Pooling. arXiv preprint arXiv:1610.04325","author":"Kim Jin-Hwa","year":"2016","unstructured":"Jin-Hwa Kim, Kyoung-Woon On, Woosang Lim, Jeonghee Kim, Jung-Woo Ha, and Byoung-Tak Zhang. 2016. Hadamard Product for Low-rank Bilinear Pooling. arXiv preprint arXiv:1610.04325 (2016)."},{"key":"e_1_3_2_1_25_1","unstructured":"Jiasen Lu Jianwei Yang Dhruv Batra and Devi Parikh. 2016. Hierarchical Question-Image Co-attention for Visual Question Answering. In Advances In Neural Information Processing Systems. 289\u2013297."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/3016387.3016405"},{"key":"e_1_3_2_1_27_1","volume-title":"Efficient Estimation of Word Representations in Vector Space. arXiv preprint arXiv:1301.3781","author":"Mikolov Tomas","year":"2013","unstructured":"Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. Efficient Estimation of Word Representations in Vector Space. arXiv preprint arXiv:1301.3781 (2013)."},{"volume-title":"CQ-VQA: Visual Question Answering on Categorized Questions. In 2020 International Joint Conference on Neural Networks (IJCNN). 1\u20138.","author":"Mishra A.","key":"e_1_3_2_1_28_1","unstructured":"A. Mishra, A. Anand, and P. Guha. 2020. CQ-VQA: Visual Question Answering on Categorized Questions. In 2020 International Joint Conference on Neural Networks (IJCNN). 1\u20138."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9413173"},{"key":"e_1_3_2_1_30_1","volume-title":"Dual Attention and Question Categorization based Visual Question Answering","author":"Mishra Aakansha","year":"2022","unstructured":"Aakansha Mishra, Ashish Anand, and Prithwijit Guha. 2022. Dual Attention and Question Categorization based Visual Question Answering. IEEE Transactions on Artificial Intelligence (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"Out of the box: Reasoning with graph convolution nets for factual visual question answering. Advances in Neural Information Processing Systems 31","author":"Narasimhan Medhini","year":"2018","unstructured":"Medhini Narasimhan, Svetlana Lazebnik, and Alexander Schwing. 2018. Out of the box: Reasoning with graph convolution nets for factual visual question answering. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00637"},{"key":"e_1_3_2_1_33_1","volume-title":"Training Recurrent Answering Units with Joint Loss Minimization for VQA. arXiv preprint arXiv:1606.03647","author":"Noh Hyeonwoo","year":"2016","unstructured":"Hyeonwoo Noh and Bohyung Han. 2016. Training Recurrent Answering Units with Joint Loss Minimization for VQA. arXiv preprint arXiv:1606.03647 (2016)."},{"key":"e_1_3_2_1_34_1","unstructured":"Will Norcliffe-Brown Stathis Vafeias and Sarah Parisot. 2018. Learning Conditioned Graph Structures for Interpretable Visual Question Answering. In Advances in Neural Information Processing Systems. 8334\u20138343."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_36_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_37_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In Advances in Neural Information Processing Systems. 91\u201399."},{"key":"e_1_3_2_1_38_1","unstructured":"Adam Santoro David Raposo David\u00a0G Barrett Mateusz Malinowski Razvan Pascanu Peter Battaglia and Timothy Lillicrap. 2017. A Simple Neural Network Module for Relational Reasoning. In Advances in Neural Information Processing Systems. 4967\u20134976."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_10"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.499"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01072"},{"key":"e_1_3_2_1_42_1","volume-title":"Very Deep Convolutional Networks for Large-scale Image Recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very Deep Convolutional Networks for Large-scale Image Recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_43_1","volume-title":"An empirical study on the generalization power of neural representations learned via visual guessing games. arXiv preprint arXiv:2102.00424","author":"Suglia Alessandro","year":"2021","unstructured":"Alessandro Suglia, Yonatan Bisk, Ioannis Konstas, Antonio Vergari, Emanuele Bastianelli, Andrea Vanzo, and Oliver Lemon. 2021. An empirical study on the generalization power of neural representations learned via visual guessing games. arXiv preprint arXiv:2102.00424 (2021)."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Sun Qiang","year":"2020","unstructured":"Qiang Sun, Binghui Xie, and Yanwei Fu. 2020. Second Order enhanced Multi-glimpse Attention in Visual Question Answering. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_45_1","volume-title":"LXMERT: Learning Cross-modality Encoder Representations from Transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-modality Encoder Representations from Transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.344"},{"key":"e_1_3_2_1_47_1","volume-title":"Cascading Top-Down Attention for Visual Question Answering. In 2020 International Joint Conference on Neural Networks (IJCNN). IEEE, 1\u20137.","author":"Tian Weidong","year":"2020","unstructured":"Weidong Tian, Rencai Zhou, and Zhongqiu Zhao. 2020. Cascading Top-Down Attention for Visual Question Answering. In 2020 International Joint Conference on Neural Networks (IJCNN). IEEE, 1\u20137."},{"key":"e_1_3_2_1_48_1","first-page":"275","article-title":"Chain of Reasoning for Visual Question Answering","volume":"31","author":"Wu Chenfei","year":"2018","unstructured":"Chenfei Wu, Jinlai Liu, Xiaojie Wang, and Xuan Dong. 2018. Chain of Reasoning for Visual Question Answering. Advances in Neural Information Processing Systems 31 (2018), 275\u2013285.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_49_1","volume-title":"Attend and Answer: Exploring Question-Guided Spatial Attention for Visual Question Answering. In European Conference on Computer Vision. Springer, 451\u2013466","author":"Xu Huijuan","year":"2016","unstructured":"Huijuan Xu and Kate Saenko. 2016. Ask, Attend and Answer: Exploring Question-Guided Spatial Attention for Visual Question Answering. In European Conference on Computer Vision. Springer, 451\u2013466."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.202"},{"key":"e_1_3_2_1_52_1","volume-title":"Beyond Bilinear: Generalized Multimodal Factorized High-order Pooling for Visual Question Answering","author":"Yu Zhou","year":"2018","unstructured":"Zhou Yu, Jun Yu, Chenchao Xiang, Jianping Fan, and Dacheng Tao. 2018. Beyond Bilinear: Generalized Multimodal Factorized High-order Pooling for Visual Question Answering. IEEE Transactions on Neural Networks and Learning Systems99 (2018), 1\u201313."},{"key":"e_1_3_2_1_53_1","volume-title":"Learning to count objects in natural images for visual question answering. arXiv preprint arXiv:1802.05766","author":"Zhang Yan","year":"2018","unstructured":"Yan Zhang, Jonathon Hare, and Adam Pr\u00fcgel-Bennett. 2018. Learning to count objects in natural images for visual question answering. arXiv preprint arXiv:1802.05766 (2018)."},{"key":"e_1_3_2_1_54_1","volume-title":"Simple baseline for visual question answering. arXiv preprint arXiv:1512.02167","author":"Zhou Bolei","year":"2015","unstructured":"Bolei Zhou, Yuandong Tian, Sainbayar Sukhbaatar, Arthur Szlam, and Rob Fergus. 2015. Simple baseline for visual question answering. arXiv preprint arXiv:1512.02167 (2015)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.145"}],"event":{"name":"ICVGIP '23: Indian Conference on Computer Vision, Graphics and Image Processing","acronym":"ICVGIP '23","location":"Rupnagar India"},"container-title":["Proceedings of the Fourteenth Indian Conference on Computer Vision, Graphics and Image Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627631.3627659","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627631.3627659","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:50:46Z","timestamp":1755892246000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627631.3627659"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,15]]},"references-count":55,"alternative-id":["10.1145\/3627631.3627659","10.1145\/3627631"],"URL":"https:\/\/doi.org\/10.1145\/3627631.3627659","relation":{},"subject":[],"published":{"date-parts":[[2023,12,15]]},"assertion":[{"value":"2024-01-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}