{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T10:24:12Z","timestamp":1772101452215,"version":"3.50.1"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012458","type":"print"},{"value":"9783030012465","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01246-5_29","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T20:14:56Z","timestamp":1538770496000},"page":"485-501","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":47,"title":["Question-Guided Hybrid Convolution for Visual Question Answering"],"prefix":"10.1007","author":[{"given":"Peng","family":"Gao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuang","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pan","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yikang","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Steven C. H.","family":"Hoi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaogang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"29_CR1","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: Advances in neural information processing systems, pp. 1097\u20131105 (2012)"},{"key":"29_CR2","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Advances in neural information processing systems, pp. 3104\u20133112 (2014)"},{"key":"29_CR3","doi-asserted-by":"crossref","unstructured":"Li, Y., Ouyang, W., Zhou, B., Wang, K., Wang, X.: Scene graph generation from objects, phrases and region captions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1261\u20131270 (2017)","DOI":"10.1109\/ICCV.2017.142"},{"key":"29_CR4","unstructured":"Xu, K., et al.: Show, attend and tell: Neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057 (2015)"},{"key":"29_CR5","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., Darrell, T.: Natural language object retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4555\u20134564 (2016)","DOI":"10.1109\/CVPR.2016.493"},{"key":"29_CR6","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"29_CR7","unstructured":"Frome, A., Corrado, G.S., Shlens, J., Bengio, S., Dean, J., Mikolov, T., et al.: DeViSE: a deep visual-semantic embedding model. In: Advances in neural information processing systems, pp. 2121\u20132129 (2013)"},{"key":"29_CR8","doi-asserted-by":"crossref","unstructured":"Reed, S., Akata, Z., Lee, H., Schiele, B.: Learning deep representations of fine-grained visual descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 49\u201358 (2016)","DOI":"10.1109\/CVPR.2016.13"},{"key":"29_CR9","unstructured":"Zhou, B., Tian, Y., Sukhbaatar, S., Szlam, A., Fergus, R.: Simple baseline for visual question answering. arXiv preprint arXiv:1512.02167 (2015)"},{"key":"29_CR10","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"29_CR11","unstructured":"Kim, J.H., On, K.W., Kim, J., Ha, J.W., Zhang, B.T.: Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325 (2016)"},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"Ben-younes, H., Cadene, R., Cord, M., Thome, N.: MUTAN: Multimodal tucker fusion for visual question answering. arXiv preprint arXiv:1705.06676 (2017)","DOI":"10.1109\/ICCV.2017.285"},{"key":"29_CR13","doi-asserted-by":"crossref","unstructured":"Chollet, F.: Xception: deep learning with depthwise separable convolutions. arXiv preprint arXiv:1610.02357 (2016)","DOI":"10.1109\/CVPR.2017.195"},{"key":"29_CR14","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated residual transformations for deep neural networks. arXiv preprint arXiv:1611.05431 (2016)","DOI":"10.1109\/CVPR.2017.634"},{"key":"29_CR15","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., RoyChowdhury, A., Maji, S.: Bilinear CNN models for fine-grained visual recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1449\u20131457 (2015)","DOI":"10.1109\/ICCV.2015.170"},{"key":"29_CR16","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"key":"29_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-319-46478-7_28","volume-title":"Computer Vision \u2013 ECCV 2016","author":"H Xu","year":"2016","unstructured":"Xu, H., Saenko, K.: Ask, attend and answer: exploring question-guided spatial attention for visual question answering. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 451\u2013466. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_28"},{"key":"29_CR18","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"29_CR19","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. In: Advances In Neural Information Processing Systems, pp. 289\u2013297 (2016)"},{"key":"29_CR20","doi-asserted-by":"crossref","unstructured":"Lu, P., Li, H., Zhang, W., Wang, J., Wang, X.: Co-attending free-form regions and detections with multi-modal multiplicative feature embedding for visual question answering. In: Proceedings of AAAI, pp. 7218\u20137225 (2018)","DOI":"10.1609\/aaai.v32i1.12240"},{"key":"29_CR21","unstructured":"Chen, K., Wang, J., Chen, L.C., Gao, H., Xu, W., Nevatia, R.: ABC-CNN: an attention based convolutional neural network for visual question answering. arXiv preprint arXiv:1511.05960 (2015)"},{"key":"29_CR22","doi-asserted-by":"crossref","unstructured":"Noh, H., Hongsuck Seo, P., Han, B.: Image question answering using convolutional neural network with dynamic parameter prediction. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 30\u201338 (2016)","DOI":"10.1109\/CVPR.2016.11"},{"key":"29_CR23","unstructured":"de Vries, H., Strub, F., Mary, J., Larochelle, H., Pietquin, O., Courville, A.: Modulating early visual processing by language. arXiv preprint arXiv:1707.00683 (2017)"},{"key":"29_CR24","doi-asserted-by":"crossref","unstructured":"Li, Z., Tao, R., Gavves, E., Snoek, C.G., Smeulders, A., et al.: Tracking by natural language specification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6495\u20136503 (2017)","DOI":"10.1109\/CVPR.2017.777"},{"key":"29_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhou, X., Lin, M., Sun, J.: ShuffleNet: an extremely efficient convolutional neural network for mobile devices. arXiv preprint arXiv:1707.01083 (2017)","DOI":"10.1109\/CVPR.2018.00716"},{"key":"29_CR26","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Visual question generation as dual task of visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6116\u20136124 (2018)","DOI":"10.1109\/CVPR.2018.00640"},{"key":"29_CR27","doi-asserted-by":"crossref","unstructured":"Li, S., Xiao, T., Li, H., Yang, W., Wang, X.: Identity-aware textual-visual matching with latent co-attention. In: IEEE International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.209"},{"key":"29_CR28","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"29_CR29","unstructured":"Salimans, T., Kingma, D.P.: Weight normalization: a simple reparameterization to accelerate training of deep neural networks. In: Advances in Neural Information Processing Systems, pp. 901\u2013909 (2016)"},{"key":"29_CR30","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., van der Maaten, L., Fei-Fei, L., Zitnick, C.L., Girshick, R.: CLEVR: a diagnostic dataset for compositional language and elementary visual reasoning. arXiv preprint arXiv:1612.06890 (2016)","DOI":"10.1109\/CVPR.2017.215"},{"key":"29_CR31","unstructured":"Kiros, R., et al.: Skip-thought vectors. In: Advances in neural information processing systems, pp. 3294\u20133302 (2015)"},{"key":"29_CR32","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"29_CR33","unstructured":"Yu, Z., Yu, J., Fan, J., Tao, D.: Multi-modal factorized bilinear pooling with co-attention learning for visual question answering"},{"key":"29_CR34","doi-asserted-by":"crossref","unstructured":"Li, S., Xiao, T., Li, H., Zhou, B., Yue, D., Wang, X.: Person search with natural language description. In: IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.551"},{"key":"29_CR35","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., Klein, D.: Neural module networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 39\u201348 (2016)","DOI":"10.1109\/CVPR.2016.12"},{"key":"29_CR36","unstructured":"Kim, J.H., et al.: Multimodal residual learning for visual QA. In: Advances in Neural Information Processing Systems, pp. 361\u2013369 (2016)"},{"key":"29_CR37","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., Klein, D.: Learning to compose neural networks for question answering. In: Proceedings of NAACL-HLT, pp. 1545\u20131554 (2016)","DOI":"10.18653\/v1\/N16-1181"},{"key":"29_CR38","unstructured":"Noh, H., Han, B.: Training recurrent answering units with joint loss minimization for VQA. arXiv preprint arXiv:1606.03647 (2016)"},{"key":"29_CR39","doi-asserted-by":"crossref","unstructured":"Li, Y., Ouyang, W., Wang, X., Tang, X.: ViP-CNN: Visual phrase guided convolutional neural network. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.766"},{"key":"29_CR40","doi-asserted-by":"crossref","unstructured":"Lu, P., Ji, L., Zhang, W., Duan, N., Zhou, M., Wang, J.: R-VQA: learning visual relation facts with semantic attention for visual question answering. In: Proceedings of SIGKDD (2018)","DOI":"10.1145\/3219819.3220036"},{"key":"29_CR41","doi-asserted-by":"crossref","unstructured":"Li, S., Bak, S., Carr, P., Wang, X.: Diversity regularized spatiotemporal attention for video-based person re-identification. In: IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00046"},{"key":"29_CR42","doi-asserted-by":"crossref","unstructured":"Nam, H., Ha, J.W., Kim, J.: Dual attention networks for multimodal reasoning and matching. arXiv preprint arXiv:1611.00471 (2016)","DOI":"10.1109\/CVPR.2017.232"},{"key":"29_CR43","doi-asserted-by":"crossref","unstructured":"Hu, R., Andreas, J., Rohrbach, M., Darrell, T., Saenko, K.: Learning to reason: end-to-end module networks for visual question answering. arXiv preprint arXiv:1704.05526 (2017)","DOI":"10.1109\/ICCV.2017.93"},{"key":"29_CR44","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Inferring and executing programs for visual reasoning. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.325"},{"key":"29_CR45","doi-asserted-by":"crossref","unstructured":"Perez, E., Strub, F., De Vries, H., Dumoulin, V., Courville, A.: Film: visual reasoning with a general conditioning layer. arXiv preprint arXiv:1709.07871 (2017)","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"29_CR46","doi-asserted-by":"crossref","unstructured":"Li, Y., Ouyang, W., Zhou, B., Shi, J., Zhang, C., Wang, X.: Factorizable net: an efficient subgraph-based framework for scene graph generation. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01246-5_21"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01246-5_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,5]],"date-time":"2022-10-05T00:19:31Z","timestamp":1664929171000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01246-5_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012458","9783030012465"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01246-5_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}