% Jafari2017: ICRA 2017 conference paper; the arXiv preprint is recorded via eprint/url.
% NOTE(review): month "feb" matches the arXiv identifier (1702), which may differ from
% the proceedings publication month -- confirm against the publisher record.
@inproceedings{Jafari2017,
	title = {Analyzing modular {CNN} architectures for joint depth prediction and semantic segmentation},
	booktitle = {Proceedings - IEEE International Conference on Robotics and Automation},
	year = {2017},
	month = feb,
	pages = {4620--4627},
	abstract = {This paper addresses the task of designing a modular neural network architecture that jointly solves different tasks. As an example we use the tasks of depth estimation and semantic segmentation given a single RGB image. The main focus of this work is to analyze the cross-modality influence between depth and semantic prediction maps on their joint refinement. While most of the previous works solely focus on measuring improvements in accuracy, we propose a way to quantify the cross-modality influence. We show that there is a relationship between final accuracy and cross-modality influence, although not a simple linear one. Hence a larger cross-modality influence does not necessarily translate into an improved accuracy. We find that a beneficial balance between the cross-modality influences can be achieved by network architecture and conjecture that this relationship can be utilized to understand different network design choices. Towards this end we propose a Convolutional Neural Network (CNN) architecture that fuses the state-of-the-art results for depth estimation and semantic labeling. By balancing the cross-modality influences between depth and semantic prediction, we achieve improved results for both tasks using the NYU-Depth v2 benchmark.},
	isbn = {9781509046331},
	issn = {1050-4729},
	doi = {10.1109/ICRA.2017.7989537},
	eprint = {1702.08009},
	archiveprefix = {arXiv},
	url = {http://arxiv.org/abs/1702.08009},
	author = {Jafari, Omid Hosseini and Groth, Oliver and Kirillov, Alexander and Yang, Michael Ying and Rother, Carsten}
}