@article{Corpataux:15235,
      recid = {15235},
      author = {Corpataux, Sam and Capallera, Marine and Abou Khaled, Omar and Mugellini, Elena},
      title = {Enhancing user immersion in virtual reality by integrating collective emotions through audio-visual analysis},
      publisher = {AHFE International},
      journal = {Proceedings of the 15th International Conference on Applied Human Factors and Ergonomics (AHFE 2024), 24-27 July 2024, Nice, France; Affective and Pleasurable Design},
      address = {New York, USA},
      year = {2024},
      month = {7},
      pages = {12 p.},
      abstract = {In the rapidly evolving field of virtual reality (VR), deep user immersion remains a major challenge for researchers and developers alike, and effectively integrating emotional cues into VR environments could be key to enhancing the user experience. This study presents a solution that combines audio and video analysis to detect and integrate collective emotions while users watch 360° events. Our approach sets itself apart from previous work, which focused on either the visual or the auditory aspect, by embracing a holistic perspective that more accurately mirrors the complexity of the human experience. We developed a machine-learning architecture that combines several models. Existing datasets were enriched in a balanced way and used to train these models, including a face extraction model and emotion classification models based on spectrograms and audio features. Predictions from these analyses are merged to generate a value representative of crowd emotion valence, using 360° videos as input. All models and the final architecture are assessed using accuracy, F1-score, precision, and recall. The proposed architecture produces a nuanced representation of collective emotion, which is then used to generate targeted visual, auditory, and haptic stimuli. These stimuli are designed to enhance user engagement and immersion in VR by adding a layer of emotional interaction, which is assessed through a user-experience study during a 360° penalty event. The sound model architecture, which employs Random Forest and XGBoost models that feed into a meta-learner, achieves an accuracy of 98.71% on the test set. Meanwhile, the model for classifying human facial emotions, tackling a challenging 7-class classification problem, achieves an accuracy of 56.15%. While this shows promising potential, incorporating additional visual elements such as object detection and scene analysis could further enrich the understanding of collective emotions and enhance the robustness of our model. The results of our study indicate that integrating these stimuli based on collective emotion recognition significantly increases user immersion. Tests with 10 participants demonstrate a particularly pronounced improvement when haptic feedback is involved, highlighting the tactile dimension as an especially powerful channel for conveying and amplifying emotions in VR environments. This research demonstrates that integrating audio and visual analyses can significantly enhance the performance and robustness of crowd emotion detection models in VR environments. By synthesizing these two input modalities, we provide a more comprehensive understanding of collective emotions, which in turn positively impacts user immersion. These findings underscore the potential for more sophisticated and emotionally aware VR systems, suggesting that similar approaches could be beneficial in advancing the field and enriching the user experience across various applications.},
      url = {http://arodes.hes-so.ch/record/15235},
      doi = {10.54941/ahfe1004687},
}
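
The abstract describes a sound-model architecture in which Random Forest and XGBoost base learners feed into a meta-learner. Below is a minimal Python sketch of such a stacking ensemble, assuming precomputed audio feature vectors (e.g., spectrogram or MFCC statistics) and binary crowd-valence labels; the placeholder data, hyperparameters, and the logistic-regression meta-learner are illustrative assumptions rather than details taken from the paper, and the visual branch and fusion step are omitted.

# Minimal sketch (not the authors' code): a stacked audio-emotion classifier.
# Assumes precomputed audio feature vectors X and binary crowd-valence labels y;
# feature extraction, the facial-emotion branch, and audio-visual fusion are omitted.
import numpy as np
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier

# Placeholder data: 1000 clips, 64-dimensional audio features, binary valence labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 64))
y = rng.integers(0, 2, size=1000)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Random Forest and XGBoost base learners feed their out-of-fold class probabilities
# into a meta-learner, mirroring the stacking scheme named in the abstract.
stack = StackingClassifier(
    estimators=[
        ("rf", RandomForestClassifier(n_estimators=300, random_state=0)),
        ("xgb", XGBClassifier(n_estimators=300, eval_metric="logloss", random_state=0)),
    ],
    final_estimator=LogisticRegression(max_iter=1000),  # assumed meta-learner
    cv=5,
    stack_method="predict_proba",
)
stack.fit(X_train, y_train)
pred = stack.predict(X_test)

# The paper evaluates models with accuracy, F1-score, precision, and recall.
print("accuracy :", accuracy_score(y_test, pred))
print("f1       :", f1_score(y_test, pred))
print("precision:", precision_score(y_test, pred))
print("recall   :", recall_score(y_test, pred))

In the pipeline described by the abstract, the output of such an audio classifier would be merged with facial-emotion predictions to produce a single crowd-valence value; how that fusion is performed is not specified in the abstract.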