{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T05:04:37Z","timestamp":1764997477739,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031200588"},{"type":"electronic","value":"9783031200595"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20059-5_28","type":"book-chapter","created":{"date-parts":[[2022,10,28]],"date-time":"2022-10-28T16:02:50Z","timestamp":1666972970000},"page":"485-501","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["AssistQ: Affordance-Centric Question-Driven Task Completion for\u00a0Egocentric Assistant"],"prefix":"10.1007","author":[{"given":"Benita","family":"Wong","sequence":"first","affiliation":[]},{"given":"Joya","family":"Chen","sequence":"additional","affiliation":[]},{"given":"You","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Stan Weixian","family":"Lei","sequence":"additional","affiliation":[]},{"given":"Dongxing","family":"Mao","sequence":"additional","affiliation":[]},{"given":"Difei","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,29]]},"reference":[{"key":"28_CR1","doi-asserted-by":"crossref","unstructured":"AlAmri, H., et al.: Audio visual scene-aware dialog. In: CVPR, pp. 7558\u20137567 (2019)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"28_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"28_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: ICCV, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"28_CR4","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViViT: a video vision transformer. In: ICCV, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"28_CR5","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: ICLR (2015)"},{"key":"28_CR6","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML (2021)"},{"key":"28_CR7","unstructured":"Chadha, A., Arora, G., Kaloty, N.: iPerceive: applying common-sense reasoning to multi-modal dense video captioning and video question answering. arXiv:2011.07735 (2020)"},{"key":"28_CR8","unstructured":"Chung, J., G\u00fcl\u00e7ehre, \u00c7., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. 
arXiv:1412.3555 (2014)"},{"key":"28_CR9","doi-asserted-by":"crossref","unstructured":"Das, A., Datta, S., Gkioxari, G., Lee, S., Parikh, D., Batra, D.: Embodied question answering. In: CVPR, pp. 1\u201310 (2018)","DOI":"10.1109\/CVPR.2018.00008"},{"key":"28_CR10","unstructured":"Das, A., et al.: TarMAC: targeted multi-agent communication. In: ICML, pp. 1538\u20131546 (2019)"},{"key":"28_CR11","doi-asserted-by":"crossref","unstructured":"Das, A., Gkioxari, G., Lee, S., Parikh, D., Batra, D.: Neural modular control for embodied question answering. In: CoRL, pp. 53\u201362 (2018)","DOI":"10.1109\/CVPR.2018.00008"},{"key":"28_CR12","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: CVPR, pp. 1080\u20131089 (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"28_CR13","doi-asserted-by":"crossref","unstructured":"Das, A., Kottur, S., Moura, J.M.F., Lee, S., Batra, D.: Learning cooperative visual dialog agents with deep reinforcement learning. In: ICCV, pp. 2970\u20132979 (2017)","DOI":"10.1109\/ICCV.2017.321"},{"key":"28_CR14","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL, pp. 4171\u20134186 (2019)"},{"key":"28_CR15","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"28_CR16","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Xiong, B., Girshick, R.B., He, K.: A large-scale study on unsupervised spatiotemporal representation learning. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"28_CR17","doi-asserted-by":"crossref","unstructured":"Gao, D., Wang, R., Bai, Z., Chen, X.: Env-QA: a video question answering benchmark for comprehensive understanding of dynamic environments. In: CVPR, pp. 1675\u20131685 (2021)","DOI":"10.1109\/ICCV48922.2021.00170"},{"key":"28_CR18","doi-asserted-by":"crossref","unstructured":"Gordon, D., Kembhavi, A., Rastegari, M., Redmon, J., Fox, D., Farhadi, A.: IQA: visual question answering in interactive environments. In: CVPR, pp. 4089\u20134098 (2018)","DOI":"10.1109\/CVPR.2018.00430"},{"issue":"4","key":"28_CR19","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1007\/s11263-018-1116-0","volume":"127","author":"Y Goyal","year":"2019","unstructured":"Goyal, Y., Khot, T., Agrawal, A., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. Int. J. Comput. Vis. 127(4), 398\u2013414 (2019)","journal-title":"Int. J. Comput. Vis."},{"key":"28_CR20","unstructured":"Grauman, K., et al: Ego4D: around the world in 3, 000 hours of egocentric video. arXiv:2110.07058 (2021)"},{"issue":"8","key":"28_CR21","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"28_CR22","doi-asserted-by":"crossref","unstructured":"Jain, U., et al.: Two body problem: collaborative visual task completion. In: CVPR, pp. 6689\u20136699 (2019)","DOI":"10.1109\/CVPR.2019.00685"},{"key":"28_CR23","doi-asserted-by":"crossref","unstructured":"Jang, Y., Song, Y., Yu, Y., Kim, Y., Kim, G.: TGIF-QA: toward spatio-temporal reasoning in visual question answering. In: CVPR, pp. 
1359\u20131367 (2017)","DOI":"10.1109\/CVPR.2017.149"},{"key":"28_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"160","DOI":"10.1007\/978-3-030-01267-0_10","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Kottur","year":"2018","unstructured":"Kottur, S., Moura, J.M.F., Parikh, D., Batra, D., Rohrbach, M.: Visual coreference resolution in visual dialog using neural module networks. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 160\u2013178. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_10"},{"key":"28_CR25","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Bansal, M., Berg, T.L.: TVQA: localized, compositional video question answering. In: EMNLP, pp. 1369\u20131379 (2018)","DOI":"10.18653\/v1\/D18-1167"},{"key":"28_CR26","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Berg, T.L., Bansal, M.: TVQA+: spatio-temporal grounding for video question answering. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J.R. (eds.) ACL, pp. 8211\u20138225 (2020)","DOI":"10.18653\/v1\/2020.acl-main.730"},{"key":"28_CR27","doi-asserted-by":"crossref","unstructured":"Li, Z., Li, Z., Zhang, J., Feng, Y., Zhou, J.: Bridging text and video: a universal multimodal transformer for audio-visual scene-aware dialog. IEEE ACM Trans. Audio Speech Lang. Process. 29, 2476\u20132483 (2021)","DOI":"10.1109\/TASLP.2021.3065823"},{"key":"28_CR28","unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. arXiv:2206.01670 (2022)"},{"key":"28_CR29","unstructured":"Loshchilov, I., Hutter, F.: SGDR: stochastic gradient descent with warm restarts. In: ICLR (2017)"},{"key":"28_CR30","unstructured":"Paszke, A., Gross, S., Massa, F.E.A.: PyTorch: an imperative style, high-performance deep learning library. In: NeurIPS, pp. 8026\u20138037 (2019)"},{"key":"28_CR31","unstructured":"Sax, A., et al.: Learning to navigate using mid-level visual priors. In: CoRL, pp. 791\u2013812 (2019)"},{"key":"28_CR32","doi-asserted-by":"crossref","unstructured":"Schwartz, I., Schwing, A.G., Hazan, T.: A simple baseline for audio-visual scene-aware dialog. In: CVPR, pp. 12548\u201312558 (2019)","DOI":"10.1109\/CVPR.2019.01283"},{"key":"28_CR33","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., Fidler, S.: MovieQA: understanding stories in movies through question-answering. In: CVPR, pp. 4631\u20134640 (2016)","DOI":"10.1109\/CVPR.2016.501"},{"key":"28_CR34","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. arXiv:2203.12602 (2022)"},{"key":"28_CR35","doi-asserted-by":"crossref","unstructured":"Wang, A.J., et al.: All in one: exploring unified video-language pre-training. arXiv:2203.07303 (2022)","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"28_CR36","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Object-aware video-language pre-training for retrieval. In: CVPR, pp. 3313\u20133322 (2022)","DOI":"10.1109\/CVPR52688.2022.00331"},{"key":"28_CR37","doi-asserted-by":"crossref","unstructured":"Wortsman, M., Ehsani, K., Rastegari, M., Farhadi, A., Mottaghi, R.: Learning to learn how to learn: self-adaptive visual navigation using meta-learning. In: CVPR, pp. 
6750\u20136759 (2019)","DOI":"10.1109\/CVPR.2019.00691"},{"key":"28_CR38","unstructured":"Yan, R., et al.: Video-text pre-training with learned regions. arXiv:2112.01194 (2021)"},{"key":"28_CR39","doi-asserted-by":"crossref","unstructured":"Yan, R., Xie, L., Tang, J., Shu, X., Tian, Q.: HiGCIN: hierarchical graph-based cross inference network for group activity recognition. IEEE Trans. Pattern Anal. Mach. Intell. 1 (2020). https:\/\/ieeexplore.ieee.org\/document\/9241410","DOI":"10.1109\/TPAMI.2020.3034233"},{"key":"28_CR40","unstructured":"Yang, W., Wang, X., Farhadi, A., Gupta, A., Mottaghi, R.: Visual semantic navigation using scene priors. In: ICLR (2019)"},{"key":"28_CR41","doi-asserted-by":"crossref","unstructured":"Yu, L., Chen, X., Gkioxari, G., Bansal, M., Berg, T.L., Batra, D.: Multi-target embodied question answering. In: CVPR, pp. 6309\u20136318 (2019)","DOI":"10.1109\/CVPR.2019.00647"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20059-5_28","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,29]],"date-time":"2023-11-29T18:59:33Z","timestamp":1701284373000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20059-5_28"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200588","9783031200595"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20059-5_28","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"29 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}