% Publications on CRFs, semantic / instance segmentation, 6D pose estimation
% and M-best diverse labelings (2015--2018).
%
% Conventions used in the entries below:
%   - one field per line; names in "Last, First" form;
%   - page ranges use "--"; months use the bare three-letter macros;
%   - acronyms and camelCase words in titles are brace-protected so that
%     sentence-casing styles do not lower-case them;
%   - truncated or placeholder volume/number values from the export were
%     dropped because they are not real volume numbers.

% Survey of CRF + DNN approaches to semantic segmentation.
% NOTE(review): journal and pages look like auto-extracted placeholders
% (volume/number were the literal strings XX / Xx and have been removed);
% confirm the venue and page range against the publisher record.
% NOTE(review): the exported url field contained two links glued together by
% an encoded newline. Only the first is kept; the second,
% http://dx.doi.org/10.1109/CVPR.2012.6248050, carries a CVPR 2012 DOI and
% therefore presumably does not belong to this 2018 article -- confirm.
@article{Arnab2018,
  author   = {Arnab, Anurag and Zheng, Shuai and Jayasumana, Sadeep and Romera-Paredes, Bernardino and Kirillov, Alexander and Savchynskyy, Bogdan and Rother, Carsten and Kahl, Fredrik and Torr, Philip},
  title    = {Conditional Random Fields Meet Deep Neural Networks for Semantic Segmentation},
  journal  = {Cvpr},
  year     = {2018},
  pages    = {1--15},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.308.8889\&rep=rep1\&type=pdf},
  keywords = {conditional random fields, deep learning, semantic segmentation},
  abstract = {Semantic Segmentation is the task of labelling every pixel in an image with a pre-defined object category. It has numerous applications in scenarios where the detailed understanding of an image is required, such as in autonomous vehicles and medical diagnosis. This problem has traditionally been solved with probabilistic models known as Conditional Random Fields (CRFs) due to their ability to model the relationships between the pixels being predicted. However, Deep Neural Networks (DNNs) have recently been shown to excel at a wide range of computer vision problems due to their ability to learn rich feature representations automatically from data, as opposed to traditional hand-crafted features. The idea of combining CRFs and DNNs have achieved state-of-the-art results in a number of domains. We review the literature on combining the modelling power of CRFs with the representation-learning ability of DNNs, ranging from early work that combines these two techniques as independent stages of a common pipeline to recent approaches that embed inference of probabilistic models directly in the neural network itself. Finally, we summarise future research directions.},
}

% Modular CNN for joint depth estimation and semantic labeling (ICRA 2017).
% The url field keeps only the arXiv link; the DOI resolver link that was
% appended to it duplicated the doi field.
@inproceedings{Jafari2017,
  author    = {Jafari, Omid Hosseini and Groth, Oliver and Kirillov, Alexander and Yang, Michael Ying and Rother, Carsten},
  title     = {Analyzing modular {CNN} architectures for joint depth prediction and semantic segmentation},
  booktitle = {Proceedings - IEEE International Conference on Robotics and Automation},
  year      = {2017},
  month     = feb,
  pages     = {4620--4627},
  isbn      = {9781509046331},
  issn      = {1050-4729},
  doi       = {10.1109/ICRA.2017.7989537},
  url       = {http://arxiv.org/abs/1702.08009},
  abstract  = {This paper addresses the task of designing a modular neural network architecture that jointly solves different tasks. As an example we use the tasks of depth estimation and semantic segmentation given a single RGB image. The main focus of this work is to analyze the cross-modality influence between depth and semantic prediction maps on their joint refinement. While most of the previous works solely focus on measuring improvements in accuracy, we propose a way to quantify the cross-modality influence. We show that there is a relationship between final accuracy and cross-modality influence, although not a simple linear one. Hence a larger cross-modality influence does not necessarily translate into an improved accuracy. We find that a beneficial balance between the cross-modality influences can be achieved by network architecture and conjecture that this relationship can be utilized to understand different network design choices. Towards this end we propose a Convolutional Neural Network (CNN) architecture that fuses the state-of-the-art results for depth estimation and semantic labeling. By balancing the cross-modality influences between depth and semantic prediction, we achieve improved results for both tasks using the NYU-Depth v2 benchmark.},
}

% Fully-connected CRF for generating 6D pose hypotheses (CVPR 2017).
@inproceedings{Michel2017,
  author    = {Michel, Frank and Kirillov, Alexander and Brachmann, Eric and Krull, Alexander and Gumhold, Stefan and Savchynskyy, Bogdan and Rother, Carsten},
  title     = {Global hypothesis generation for {6D} object pose estimation},
  booktitle = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017},
  year      = {2017},
  month     = dec,
  pages     = {115--124},
  isbn      = {9781538604571},
  doi       = {10.1109/CVPR.2017.20},
  url       = {http://arxiv.org/abs/1612.02287},
  abstract  = {This paper addresses the task of estimating the 6D pose of a known 3D object from a single RGB-D image. Most modern approaches solve this task in three steps: i) Compute local features; ii) Generate a pool of pose-hypotheses; iii) Select and refine a pose from the pool. This work focuses on the second step. While all existing approaches generate the hypotheses pool via local reasoning, e.g. RANSAC or Hough-voting, we are the first to show that global reasoning is beneficial at this stage. In particular, we formulate a novel fully-connected Conditional Random Field (CRF) that outputs a very small number of pose-hypotheses. Despite the potential functions of the CRF being non-Gaussian, we give a new and efficient two-step optimization procedure, with some guarantees for optimality. We utilize our global hypotheses generation procedure to produce results that exceed state-of-the-art for the challenging ``Occluded Object Dataset''.},
}

% Instance-aware semantic segmentation via a MultiCut formulation (CVPR 2017).
@inproceedings{Kirillov2017a,
  author    = {Kirillov, Alexander and Levinkov, Evgeny and Andres, Bj{\"o}rn and Savchynskyy, Bogdan and Rother, Carsten},
  title     = {{InstanceCut}: From edges to instances with {MultiCut}},
  booktitle = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017},
  year      = {2017},
  pages     = {7322--7331},
  isbn      = {9781538604571},
  doi       = {10.1109/CVPR.2017.774},
  abstract  = {This work addresses the task of instance-aware semantic segmentation. Our key motivation is to design a simple method with a new modelling-paradigm, which therefore has a different trade-off between advantages and disadvantages compared to known approaches. Our approach, we term InstanceCut, represents the problem by two output modalities: (i) an instance-agnostic semantic segmentation and (ii) all instance-boundaries. The former is computed from a standard convolutional neural network for semantic segmentation, and the latter is derived from a new instance-aware edge detection model. To reason globally about the optimal partitioning of an image into instances, we combine these two modalities into a novel MultiCut formulation. We evaluate our approach on the challenging CityScapes dataset. Despite the conceptual simplicity of our approach, we achieve the best result among all published methods, and perform particularly well for rare object classes.},
}

% Joint graph decomposition and node labeling problem (CVPR 2017).
@inproceedings{Levinkov2017,
  author    = {Levinkov, Evgeny and Uhrig, Jonas and Tang, Siyu and Omran, Mohamed and Insafutdinov, Eldar and Kirillov, Alexander and Rother, Carsten and Brox, Thomas and Schiele, Bernt and Andres, Bj{\"o}rn},
  title     = {Joint graph decomposition \& node labeling: Problem, algorithms, applications},
  booktitle = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017},
  year      = {2017},
  pages     = {1904--1912},
  isbn      = {9781538604571},
  doi       = {10.1109/CVPR.2017.206},
  abstract  = {We state a combinatorial optimization problem whose feasible solutions define both a decomposition and a node labeling of a given graph. This problem offers a common mathematical abstraction of seemingly unrelated computer vision tasks, including instance-separating semantic segmentation, articulated human body pose estimation and multiple object tracking. Conceptually, the problem we state generalizes the unconstrained integer quadratic program and the minimum cost lifted multicut problem, both of which are NP-hard. In order to find feasible solutions efficiently, we define two local search algorithms that converge monotonously to a local optimum, offering a feasible solution at any time. To demonstrate their effectiveness in tackling computer vision tasks, we apply these algorithms to instances of the problem that we construct from published data, using published algorithms. We report state-of-the-art application-specific accuracy for the three above-mentioned applications.},
}

% M-best diverse labelings as a single energy minimization problem (ICCV 2015).
@inproceedings{Kirillov2015a,
  author    = {Kirillov, Alexander and Savchynskyy, Bogdan and Schlesinger, Dmitrij and Vetrov, Dmitry and Rother, Carsten},
  title     = {Inferring {M}-best diverse labelings in a single one},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
  year      = {2015},
  pages     = {1814--1822},
  isbn      = {9781467383912},
  issn      = {1550-5499},
  doi       = {10.1109/ICCV.2015.211},
  abstract  = {We consider the task of finding M-best diverse solutions in a graphical model. In a previous work by Batra et al. an algorithmic approach for finding such solutions was proposed, and its usefulness was shown in numerous applications. Contrary to previous work we propose a novel formulation of the problem in form of a single energy minimization problem in a specially constructed graphical model. We show that the method of Batra et al. can be considered as a greedy approximate algorithm for our model, whereas we introduce an efficient specialized optimization technique for it, based on alpha-expansion. We evaluate our method on two application scenarios, interactive and semantic image segmentation, with binary and multiple labels. In both cases we achieve considerably better error rates than state-of-the-art diversity methods. Furthermore, we empirically discover that in the binary label case we were able to reach global optimality for all test instances.},
}

% Joint inference of M best diverse solutions for submodular energies
% (Advances in Neural Information Processing Systems, 2015).
@inproceedings{Kirillov2015,
  author    = {Kirillov, Alexander and Schlesinger, Dmitrij and Vetrov, Dmitry and Rother, Carsten and Savchynskyy, Bogdan},
  title     = {{M}-best-diverse labelings for submodular energies and beyond},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2015},
  pages     = {613--621},
  issn      = {1049-5258},
  abstract  = {We consider the problem of finding M best diverse solutions of energy minimization problems for graphical models. Contrary to the sequential method of Batra et al., which greedily finds one solution after another, we infer all M solutions jointly. It was shown recently that such jointly inferred labelings not only have smaller total energy but also qualitatively outperform the sequentially obtained ones. The only obstacle for using this new technique is the complexity of the corresponding inference problem, since it is considerably slower algorithm than the method of Batra et al. In this work we show that the joint inference of M best diverse solutions can be formulated as a submodular energy minimization if the original MAP-inference problem is submodular, hence fast inference techniques can be used. In addition to the theoretical results we provide practical algorithms that outperform the current state-of-the-art and can be used in both submodular and non-submodular case.},
}