@inproceedings{1af6e1fc664a4b74876025c1b99496cc,
title = "Mixed Effects Neural Networks ({MeNets}) with Applications to Gaze Estimation",
abstract = "There is much interest in computer vision to utilize commodity hardware for gaze estimation. A number of papers have shown that algorithms based on deep convolutional architectures are approaching accuracies where streaming data from mass-market devices can offer good gaze tracking performance, although a gap still remains between what is possible and the performance users will expect in real deployments. We observe that one obvious avenue for improvement relates to a gap between some basic technical assumptions behind most existing approaches and the statistical properties of the data used for training. Specifically, most training datasets involve tens of users with a few hundreds (or more) repeated acquisitions per user. The non i.i.d. nature of this data suggests better estimation may be possible if the model explicitly made use of such 'repeated measurements' from each user as is commonly done in classical statistical analysis using so-called mixed effects models. The goal of this paper is to adapt these 'mixed effects' ideas from statistics within a deep neural network architecture for gaze estimation, based on eye images. Such a formulation seeks to specifically utilize information regarding the hierarchical structure of the training data-each node in the hierarchy is a user who provides tens or hundreds of repeated samples. This modification yields an architecture that offers state of the art performance on various publicly available datasets improving results by 10-20%.",
keywords = "Face, Gesture, and Body Pose, Motion and Tracking",
author = "Xiong, Yunyang and Kim, {Hyunwoo J.} and Singh, Vikas",
note = "Funding Information: Most researchers performing data analysis know that the choice of the correct model for the data at hand can lead to improvements in performance, and conversely a suboptimal model can yield poor results. For appearance based gaze estimation, we explore how an appropriate statistical model that leverages information regarding repeated measurements from the same participant, a common feature of most if not all existing datasets, seems like a much better fit but has not been explored in computer vision much. To practicalize this observation within modern architectures, we propose a formulation that estimates a mixed effects model while leveraging the benefits of powerful deep neural networks. This conceptually simple idea leads to improvements (10-20% and more in some cases) over the state of the art on most gaze estimation datasets. Code and appendix are available at https://github.com/vsingh-group/MeNets. Acknowledgments. This work was supported by UW CPCP AI117924 and NSF CAREER award RI 1252725, and partially supported by R01 EB022883, R01 AG062336, R01 AG040396 and UW ADRC (AG033514). We thank Karu Sankaralingam for discussions, and Mona Jalal, Ronak Mehta, Ligang Zheng, Brandon M. Smith, Sukanya Venkataraman, Haoliang Sun, Xiaoming Zhang, Sathya Narayanan Ravi and Seong Jae Hwang for helping with various aspects of the experiments. Publisher Copyright: {\textcopyright} 2019 IEEE.; 32nd IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2019 ; Conference date: 16-06-2019 Through 20-06-2019",
year = "2019",
month = jun,
doi = "10.1109/CVPR.2019.00793",
language = "English",
series = "Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition",
publisher = "IEEE Computer Society",
pages = "7735--7744",
booktitle = "Proceedings - 2019 {IEEE}/{CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2019",
}