{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T16:21:41Z","timestamp":1780503701200,"version":"3.54.1"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"29","license":[{"start":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T00:00:00Z","timestamp":1683244800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T00:00:00Z","timestamp":1683244800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s11042-023-15606-4","type":"journal-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T06:02:29Z","timestamp":1683266549000},"page":"45679-45697","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Cross-modal multi-headed attention for long multimodal conversations"],"prefix":"10.1007","volume":"82","author":[{"given":"Harshith","family":"Belagur","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"N. Saketh","family":"Reddy","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"P. Radha","family":"Krishna","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Raj","family":"Tumuluri","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2023,5,5]]},"reference":[{"key":"15606_CR1","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Zitnick C L, Parikh D (2015) VQA: Visual question answering. 2015 IEEE International Conference on Computer Vision (ICCV 2015), Santiago, Chile, 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"issue":"4","key":"15606_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2766959","volume":"34","author":"S Bell","year":"2015","unstructured":"Bell S, Bala K (2015) Learning Visual Similarity for Product Design with Convolutional Neural Networks. ACM Trans Graph (TOG) 34(4):1\u201310","journal-title":"ACM Trans Graph (TOG)"},{"key":"15606_CR3","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"P Bojanowski","year":"2016","unstructured":"Bojanowski P, Grave E, Joulin A, Mikolov T (2016) Enriching Word Vectors with Subword Information. Trans Assoc Comput Linguist 5:135\u2013146","journal-title":"Trans Assoc Comput Linguist"},{"key":"15606_CR4","first-page":"427","volume":"2","author":"P Bojanowski","year":"2016","unstructured":"Bojanowski P, Grave E, Joulin A, Mikolov T (2016) Bag of Tricks for Efficient Text Classification. Proc. of the 15th Conference of the European Chapter of the Association for Computational Linguistics (EACL). Valencia, Spain, ACL 2:427\u2013431","journal-title":"Valencia, Spain, ACL"},{"key":"15606_CR5","doi-asserted-by":"crossref","unstructured":"Chauhan H, Firdaus M, Ekbal A, Bhattacharyya P (2019) Ordinal and Attribute Aware Response Generation in a Multimodal Dialogue System. Proc. of the 57th Annual Meeting of the Association for Computational Linguistics 5437\u20135447.","DOI":"10.18653\/v1\/P19-1540"},{"key":"15606_CR6","doi-asserted-by":"publisher","first-page":"195","DOI":"10.1016\/j.neucom.2020.10.042","volume":"426","author":"W Chen","year":"2021","unstructured":"Chen W, Wang W, Liu L, Lew MS (2021) New Ideas and Trends in Deep Multimodal Content Understanding: A Review. Neurocomputing 426:195\u2013215","journal-title":"Neurocomputing"},{"key":"15606_CR7","doi-asserted-by":"crossref","unstructured":"Das A, Kottur S, Gupta K, Singh A, Yadav D, Moura J M F, Parikh D, Batra D (2017) Visual dialog. Proc. of the IEEE Computer Vision and Pattern Recognition (CVPR), IEEE Xplore Honolulu, HI, USA, 326\u2013335","DOI":"10.1109\/CVPR.2017.121"},{"key":"15606_CR8","unstructured":"Devlin J, Chang M-W, Lee K, Toutanov K (2019) BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, (NAACL), 1, 4171\u20134186."},{"key":"15606_CR9","doi-asserted-by":"publisher","first-page":"57","DOI":"10.3389\/fpsyg.2021.664747","volume":"12","author":"M Fatigante","year":"2021","unstructured":"Fatigante M, Zucchermaglio C, Alby F (2021) Being in Place: A Multimodal Analysis of the Contribution of the Patient\u2019s Companion to \u201cFirst Time\u201d Oncological Visits. Front Psychol 12:57\u201379. https:\/\/doi.org\/10.3389\/fpsyg.2021.664747","journal-title":"Front Psychol"},{"issue":"8","key":"15606_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.14201\/ADCAIJ2014381326","volume":"3","author":"D Griol","year":"2014","unstructured":"Griol D, Molina JM, de Miguel AS (2014) Developing multimodal conversational agents for an enhanced e-learning experience. Adv Distrib Comput Artif Intell J 3(8):1\u201313. https:\/\/doi.org\/10.14201\/ADCAIJ2014381326","journal-title":"Adv Distrib Comput Artif Intell J"},{"key":"15606_CR11","doi-asserted-by":"crossref","unstructured":"Han X, Wu Z, Huang P X, Zhang X, Zhu M, Li Y, Zhao Y, Davis L S (2017) Automatic Spatially-Aware Fashion Concept Discovery. 2017 IEEE International Conference on Computer Vision (ICCV), 1472\u20131480.","DOI":"10.1109\/ICCV.2017.163"},{"key":"15606_CR12","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep Residual Learning for Image Recognition. IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"15606_CR13","doi-asserted-by":"crossref","unstructured":"Hsiao J -H, Li L -J (2014) On Visual Similarity based Interactive Product Recommendation for Online Shopping. 2014 IEEE International Conference on Image Processing (ICIP) 3038\u20133041","DOI":"10.1109\/ICIP.2014.7025614"},{"key":"15606_CR14","doi-asserted-by":"crossref","unstructured":"Jiang S, Rijke M de (2018) Why are sequence-to-sequence models so dull? Understanding the low-diversity problem of chatbots. Proc. of the 2018 EMNLP Workshop on Search-Oriented Conversational AI (SCAI), Brussels, Belgium. 81\u201386","DOI":"10.18653\/v1\/W18-5712"},{"key":"15606_CR15","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1016\/j.knosys.2006.11.014","volume":"20","author":"A Kerly","year":"2007","unstructured":"Kerly A, Hall P, Bull S (2007) Bringing chatbots into education: Towards natural language negotiation of open learner models. Knowl Based Syst 20:177\u2013185","journal-title":"Knowl Based Syst"},{"key":"15606_CR16","unstructured":"Kingma D P, Adam J Ba (2015) A method for stochastic optimization. 3rd International Conference for Learning Representations, San Diego"},{"key":"15606_CR17","doi-asserted-by":"crossref","unstructured":"Laenen K, Zoghbi S, Moens M-F (2018) Web Search of Fashion Items with Multimodal Querying. Proc. of 11th ACM International Conference on Web Search and Data Mining (WSDM 2018), Marina Del Rey, CA, USA.","DOI":"10.1145\/3159652.3159716"},{"key":"15606_CR18","first-page":"74","volume-title":"Text Summarization Branches Out","author":"C-Y Lin","year":"2004","unstructured":"Lin C-Y (2004) ROUGE: A Package for Automatic Evaluation of Summaries. Text Summarization Branches Out. Spain ACL, Barcelona, pp 74\u201381"},{"key":"15606_CR19","unstructured":"Mikolov T, Chen K, Corrado G, Dean J (2013) Efficient Estimation of Word Representations in Vector Space. Proc. of Workshop at ICLR. arXiv:1301.3781v1"},{"key":"15606_CR20","unstructured":"Mostafazadeh N, Brockett C, Dolan B, Galley M, Gao J, Spithourakis G P, Vanderwende L (2017) Image grounded conversations: Multimodal context for natural question and response generation. Proc. of the Eighth International Joint Conference on Natural Language Processing (IJACNLP), Taipei, Taiwan. 1, 462\u2013472."},{"key":"15606_CR21","unstructured":"Nils R, Gurevych I (2019) Sentence-Bert: Sentence Embeddings Using Siamese Bert-Networks. ArXiv.org, 27 Aug. 2019."},{"key":"15606_CR22","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, ZhuBLEU W J (2002) A method for automatic evaluation of machine translation. Proc. of the 40th Annual Meeting on Association for Computational Linguistics (ACL 2002), 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"15606_CR23","unstructured":"Paranjape A, See A, Kenealy K, Li H, Hardy A, Qi P, Sadagopan K R, Phu N M, Soylu D, Manning C D (2020) Neural generation meets real people: Towards emotionally engaging mixed-initiative conversations. Stanford NLP, 3rd Proceedings of Alexa Prize. arXiv:2008.12348"},{"key":"15606_CR24","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning C (2014) GloVe: Global Vectors for Word Representation. Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) , Doha, Qatar, ACL, 1532\u20131543.","DOI":"10.3115\/v1\/D14-1162"},{"key":"15606_CR25","doi-asserted-by":"crossref","unstructured":"Rajpurkar P, Zhang J, Lopyrev K, Liang P (2016) Squad: 100,000+ questions for machine comprehension of text. Proc. of the 2016 Conference on Empirical Methods in Natural Language Processing (EMNLP), Texas, US, ACL, 2383\u20132392.","DOI":"10.18653\/v1\/D16-1264"},{"key":"15606_CR26","doi-asserted-by":"publisher","unstructured":"Roccetti M, Marfia G, Salomoni P, Prandi C, Zagari R M, Kengni FLG, Bazzoli F, Montagnani M (2017) Attitudes of Crohn's Disease Patients: Infodemiology Case Study and Sentiment Analysis of Facebook and Twitter Posts. JMIR Public Health Surveill. 3(3) https:\/\/doi.org\/10.2196\/publichealth.7004","DOI":"10.2196\/publichealth.7004"},{"key":"15606_CR27","doi-asserted-by":"crossref","unstructured":"Saha A, Khapra M M, Sankaranarayanan K (2018) Towards building large scale multimodal domain-aware conversation systems. Proc. of 32nd AAAI Conference on Artificial Intelligence 696\u2013704.","DOI":"10.1609\/aaai.v32i1.11331"},{"key":"15606_CR28","doi-asserted-by":"crossref","unstructured":"Sapna C R, Anagha M, Vats K, Baradia K, Khan T, Sarkar S, Roychowdhury S (2019) Recommendence and fashionsence online fashion advisor for offline experience. ACM International Conference Proceeding series, 256\u2013259.","DOI":"10.1145\/3297001.3297035"},{"key":"15606_CR29","doi-asserted-by":"crossref","unstructured":"Schaffer S, Reithinger N (2019) Conversation is multimodal: thus conversational user interfaces should be as well. Proc. of the 1st International Conference on Conversational User Interfaces (CUI '19). ACM, New York, NY, USA. Article 12, 1\u20133.","DOI":"10.1145\/3342775.3342801"},{"key":"15606_CR30","doi-asserted-by":"crossref","unstructured":"Serban V, Sordoni A, Lowe R, Charlin L, Pineau J, Courville A C, Bengio Y (2017) A hierarchical latent variable encoder-decoder model for generating dialogues. Proc of AAAI, 3295\u20133301","DOI":"10.1609\/aaai.v31i1.10983"},{"key":"15606_CR31","unstructured":"Shubham A, Dusek O, Konstas I, Rieser V (2018) Improving context modeling in multimodal dialogue generation. Proc. of 11th International Conference on Natural Language Generation 129\u2013134"},{"key":"15606_CR32","unstructured":"Simonoyan K, Zisserman A (2015) Very Deep Convolutional Networks for Large-Scale Image Recognition. Proc. of 3rd International Conference on Learning Representations (ICLR 2015), San Diego, CA, USA."},{"key":"15606_CR33","doi-asserted-by":"crossref","unstructured":"Tao C, Gao S, Shang M, Wu W, Zhao D, Yan R (2018) Get the point of my utterance! Learning towards effective responses with a multi-head attention mechanism. Proc. of the 27th International Joint Conference on Artificial Intelligence 4418\u20134424.","DOI":"10.24963\/ijcai.2018\/614"},{"key":"15606_CR34","doi-asserted-by":"crossref","unstructured":"Thomas NT (2016) An e-business chatbot using AIML and LSA, Proc. Int. Conf. Adv. Computing Commun. Informat. (ICACCI), 2740\u20132742","DOI":"10.1109\/ICACCI.2016.7732476"},{"key":"15606_CR35","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A N, Kaiser L, Polosukhin I (2017) Attention is All you Need. In Advances in Neural Information Processing Systems 30 (NIPS 2017), 1\u201311."},{"key":"15606_CR36","unstructured":"Vries H de, Strub F, Chandar S, Pietquin O, Larochelle H, Courville AC (2017) Guesswhat?! visual object discovery through multimodal dialogue. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 4466\u20134475."},{"key":"15606_CR37","doi-asserted-by":"crossref","unstructured":"Xu A, Liu Z, Guo Y, Sinha V, Akkiraju R (2017) A new chatbot for customer service on social media, Proc. CHI Conf. Human Factors Comput. Syst. (CHI) 3506\u20133510","DOI":"10.1145\/3025453.3025496"},{"key":"15606_CR38","doi-asserted-by":"crossref","unstructured":"Zhao B, Feng J, Wu X, Yan S (2017) Memory-Augmented Attribute Manipulation Networks for Interactive Fashion Search. IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017) 6156\u20136164","DOI":"10.1109\/CVPR.2017.652"},{"issue":"1","key":"15606_CR39","doi-asserted-by":"publisher","first-page":"31","DOI":"10.17706\/IJCEE.2016.8.1.31-43","volume":"8","author":"S Zoghbi","year":"2016","unstructured":"Zoghbi S, Heyman G, Gomez JC, Moens M-F (2016) Fashion Meets Computer Vision and NLP at e-Commerce Search. Int J Comput Elec Eng (IJCEE) 8(1):31\u201343","journal-title":"Int J Comput Elec Eng (IJCEE)"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-15606-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-023-15606-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-15606-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,28]],"date-time":"2023-11-28T10:23:49Z","timestamp":1701167029000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-023-15606-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,5]]},"references-count":39,"journal-issue":{"issue":"29","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["15606"],"URL":"https:\/\/doi.org\/10.1007\/s11042-023-15606-4","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,5,5]]},"assertion":[{"value":"6 November 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 May 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 April 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 May 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interests"}}]}}