% Publications on 6D pose estimation and camera localization.
%
% Conventions used in this file:
%   - one field per line; author names in "Last, First" form, joined by " and "
%   - page ranges use a double hyphen; month uses the bare three-letter macro
%   - acronyms in titles are brace-protected so styles cannot lowercase them
%   - citation keys are unchanged, so existing citations keep resolving

@inproceedings{Brachmann2017,
  author    = {Brachmann, Eric and Krull, Alexander and Nowozin, Sebastian and Shotton, Jamie and Michel, Frank and Gumhold, Stefan and Rother, Carsten},
  title     = {{DSAC} - Differentiable {RANSAC} for camera localization},
  booktitle = {Proceedings - 30th {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2017},
  volume    = {2017-January},
  pages     = {2492--2500},
  year      = {2017},
  month     = nov,
  isbn      = {9781538604571},
  doi       = {10.1109/CVPR.2017.267},
  url       = {http://arxiv.org/abs/1611.05705},
  abstract  = {RANSAC is an important algorithm in robust optimization and a central building block for many computer vision applications. In recent years, traditionally hand-crafted pipelines have been replaced by deep learning pipelines, which can be trained in an end-to-end fashion. However, RANSAC has so far not been used as part of such deep learning pipelines, because its hypothesis selection procedure is non-differentiable. In this work, we present two different ways to overcome this limitation. The most promising approach is inspired by reinforcement learning, namely to replace the deterministic hypothesis selection by a probabilistic selection for which we can derive the expected loss w.r.t. to all learnable parameters. We call this approach DSAC, the differentiable counterpart of RANSAC. We apply DSAC to the problem of camera localization, where deep learning has so far failed to improve on traditional approaches. We demonstrate that by directly minimizing the expected loss of the output camera poses, robustly estimated by RANSAC, we achieve an increase in accuracy. In the future, any deep learning pipeline can use DSAC as a robust optimization component.},
}

@inproceedings{Michel2017,
  author    = {Michel, Frank and Kirillov, Alexander and Brachmann, Eric and Krull, Alexander and Gumhold, Stefan and Savchynskyy, Bogdan and Rother, Carsten},
  title     = {Global hypothesis generation for {6D} object pose estimation},
  booktitle = {Proceedings - 30th {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2017},
  volume    = {2017-January},
  pages     = {115--124},
  year      = {2017},
  month     = dec,
  isbn      = {9781538604571},
  doi       = {10.1109/CVPR.2017.20},
  url       = {http://arxiv.org/abs/1612.02287},
  abstract  = {This paper addresses the task of estimating the 6D pose of a known 3D object from a single RGB-D image. Most modern approaches solve this task in three steps: i) Compute local features; ii) Generate a pool of pose-hypotheses; iii) Select and refine a pose from the pool. This work focuses on the second step. While all existing approaches generate the hypotheses pool via local reasoning, e.g. RANSAC or Hough-voting, we are the first to show that global reasoning is beneficial at this stage. In particular, we formulate a novel fully-connected Conditional Random Field (CRF) that outputs a very small number of pose-hypotheses. Despite the potential functions of the CRF being non-Gaussian, we give a new and efficient two-step optimization procedure, with some guarantees for optimality. We utilize our global hypotheses generation procedure to produce results that exceed state-of-the-art for the challenging ``Occluded Object Dataset''.},
}

@inproceedings{Krull2017,
  author    = {Krull, Alexander and Brachmann, Eric and Nowozin, Sebastian and Michel, Frank and Shotton, Jamie and Rother, Carsten},
  title     = {{PoseAgent}: Budget-constrained {6D} object pose estimation via reinforcement learning},
  booktitle = {Proceedings - 30th {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2017},
  volume    = {2017-January},
  pages     = {2566--2574},
  year      = {2017},
  month     = dec,
  isbn      = {9781538604571},
  doi       = {10.1109/CVPR.2017.275},
  url       = {http://arxiv.org/abs/1612.03779},
  abstract  = {State-of-the-art computer vision algorithms often achieve efficiency by making discrete choices about which hypotheses to explore next. This allows allocation of computational resources to promising candidates, however, such decisions are non-differentiable. As a result, these algorithms are hard to train in an end-to-end fashion. In this work we propose to learn an efficient algorithm for the task of 6D object pose estimation. Our system optimizes the parameters of an existing state-of-the art pose estimation system using reinforcement learning, where the pose estimation system now becomes the stochastic policy, parametrized by a CNN. Additionally, we present an efficient training algorithm that dramatically reduces computation time. We show empirically that our learned pose estimation procedure makes better use of limited resources and improves upon the state-of-the-art on a challenging dataset. Our approach enables differentiable end-to-end training of complex algorithmic pipelines and learns to make optimal use of a given computational budget.},
}

% NOTE(review): the title below ends in "camera location" while the abstract
% speaks of "camera localization" -- confirm the exact title against the paper.
% No venue, DOI or URL is recorded; add howpublished/eprint data once known.
@misc{Massiceti,
  author   = {Massiceti, Daniela and Krull, Alexander and Brachmann, Eric and Rother, Carsten and Torr, Philip H. S.},
  title    = {Random Forests versus Neural Networks - What's best for camera location},
  year     = {2017},
  abstract = {This work addresses the task of camera localization in a known 3D scene given a single input RGB image. State-of-the-art approaches accomplish this in two steps: firstly, regressing for every pixel in the image its 3D scene coordinate and subsequently, using these coordinates to estimate the final 6D camera pose via RANSAC. To solve the first step, Random Forests (RFs) are typically used. On the other hand, Neural Networks (NNs) reign in many dense regression tasks, but are not test-time efficient. We ask the question: which of the two is best for camera localization? To address this, we make two method contributions: (1) a test-time efficient NN architecture which we term a ForestNet that is derived and initialized from a RF, and (2) a new fully-differentiable robust averaging technique for regression ensembles which can be trained end-to-end with a NN. Our experimental findings show that for scene coordinate regression, traditional NN architectures are superior to test-time efficient RFs and ForestNets, however, this does not translate to final 6D camera pose accuracy where RFs and ForestNets perform slightly better. To summarize, our best method, a ForestNet with a robust average, which has an equivalent fast and lightweight RF, improves over the state-of-the-art for camera localization on the 7-Scenes dataset [1]. While this work focuses on scene coordinate regression for camera localization, our innovations may also be applied to other continuous regression tasks.},
}

% NOTE(review): Brachmann2016 and Brachmann2016a describe the same paper
% (identical fields, same DOI). Both keys are kept so that existing citations
% of either key still resolve; cite only one of them in new documents and
% retire the other once nothing references it.
@inproceedings{Brachmann2016,
  author    = {Brachmann, Eric and Michel, Frank and Krull, Alexander and Yang, Michael Ying and Gumhold, Stefan and Rother, Carsten},
  title     = {Uncertainty-Driven {6D} Pose Estimation of Objects and Scenes from a Single {RGB} Image},
  booktitle = {Proceedings of the {IEEE} Computer Society Conference on Computer Vision and Pattern Recognition},
  volume    = {2016-December},
  pages     = {3364--3372},
  year      = {2016},
  isbn      = {9781467388504},
  issn      = {1063-6919},
  doi       = {10.1109/CVPR.2016.366},
  abstract  = {In recent years, the task of estimating the 6D pose of object instances and complete scenes, i.e. camera localization, from a single input image has received considerable attention. Consumer RGB-D cameras have made this feasible, even for difficult, texture-less objects and scenes. In this work, we show that a single RGB image is sufficient to achieve visually convincing results. Our key concept is to model and exploit the uncertainty of the system at all stages of the processing pipeline. The uncertainty comes in the form of continuous distributions over 3D object coordinates and discrete distributions over object labels. We give three technical contributions. Firstly, we develop a regularized, auto-context regression framework which iteratively reduces uncertainty in object coordinate and object label predictions. Secondly, we introduce an efficient way to marginalize object coordinate distributions over depth. This is necessary to deal with missing depth information. Thirdly, we utilize the distributions over object labels to detect multiple objects simultaneously with a fixed budget of RANSAC hypotheses. We tested our system for object pose estimation and camera localization on commonly used data sets. We see a major improvement over competing systems.},
}

@inproceedings{Brachmann2016a,
  author    = {Brachmann, Eric and Michel, Frank and Krull, Alexander and Yang, Michael Ying and Gumhold, Stefan and Rother, Carsten},
  title     = {Uncertainty-Driven {6D} Pose Estimation of Objects and Scenes from a Single {RGB} Image},
  booktitle = {Proceedings of the {IEEE} Computer Society Conference on Computer Vision and Pattern Recognition},
  volume    = {2016-December},
  pages     = {3364--3372},
  year      = {2016},
  isbn      = {9781467388504},
  issn      = {1063-6919},
  doi       = {10.1109/CVPR.2016.366},
  abstract  = {In recent years, the task of estimating the 6D pose of object instances and complete scenes, i.e. camera localization, from a single input image has received considerable attention. Consumer RGB-D cameras have made this feasible, even for difficult, texture-less objects and scenes. In this work, we show that a single RGB image is sufficient to achieve visually convincing results. Our key concept is to model and exploit the uncertainty of the system at all stages of the processing pipeline. The uncertainty comes in the form of continuous distributions over 3D object coordinates and discrete distributions over object labels. We give three technical contributions. Firstly, we develop a regularized, auto-context regression framework which iteratively reduces uncertainty in object coordinate and object label predictions. Secondly, we introduce an efficient way to marginalize object coordinate distributions over depth. This is necessary to deal with missing depth information. Thirdly, we utilize the distributions over object labels to detect multiple objects simultaneously with a fixed budget of RANSAC hypotheses. We tested our system for object pose estimation and camera localization on commonly used data sets. We see a major improvement over competing systems.},
}

% NOTE(review): the volume value "2015 Inter" is a truncated export artifact
% whose full form cannot be recovered from this file; replace it with the
% correct value or drop the field after checking the publisher record.
@inproceedings{Krull2015,
  author    = {Krull, Alexander and Brachmann, Eric and Michel, Frank and Yang, Michael Ying and Gumhold, Stefan and Rother, Carsten},
  title     = {Learning analysis-by-synthesis for {6D} pose estimation in {RGB-D} images},
  booktitle = {Proceedings of the {IEEE} International Conference on Computer Vision},
  volume    = {2015 Inter},
  pages     = {954--962},
  year      = {2015},
  isbn      = {9781467383912},
  issn      = {1550-5499},
  doi       = {10.1109/ICCV.2015.115},
  abstract  = {Analysis-by-synthesis has been a successful approach for many tasks in computer vision, such as 6D pose estimation of an object in an RGB-D image which is the topic of this work. The idea is to compare the observation with the output of a forward process, such as a rendered image of the object of interest in a particular pose. Due to occlusion or complicated sensor noise, it can be difficult to perform this comparison in a meaningful way. We propose an approach that ``learns to compare'', while taking these difficulties into account. This is done by describing the posterior density of a particular object pose with a convolutional neural network (CNN) that compares observed and rendered images. The network is trained with the maximum likelihood paradigm. We observe empirically that the CNN does not specialize to the geometry or appearance of specific objects. It can be used with objects of vastly different shapes and appearances, and in different backgrounds. Compared to state-of-the-art, we demonstrate a significant improvement on two different datasets which include a total of eleven objects, cluttered background, and heavy occlusion.},
}

% NOTE(review): booktitle was missing (a required field for this entry type).
% It is filled in from the DOI prefix 10.5244 (BMVA) and the article-number
% page style -- confirm the exact proceedings title.
@inproceedings{Michel2015,
  author    = {Michel, Frank and Krull, Alexander and Brachmann, Eric and Yang, Michael Ying and Gumhold, Stefan and Rother, Carsten},
  title     = {Pose Estimation of Kinematic Chain Instances via Object Coordinate Regression},
  booktitle = {Proceedings of the British Machine Vision Conference ({BMVC})},
  pages     = {181.1--181.11},
  year      = {2015},
  doi       = {10.5244/c.29.181},
  abstract  = {In this paper, we address the problem of one shot pose estimation of articulated objects from an RGB-D image. In particular, we consider object instances with the topology of a kinematic chain, i.e. assemblies of rigid parts connected by prismatic or revolute joints. This object type occurs often in daily live, for instance in the form of furniture or electronic devices. Instead of treating each object part separately we are using the relationship between parts of the kinematic chain and propose a new minimal pose sampling approach. This enables us to create a pose hypothesis for a kinematic chain consisting of K parts by sampling K 3D-3D point correspondences. To asses the quality of our method, we gathered a large dataset containing four objects and 7000+ annotated RGB-D frames. On this dataset we achieve considerably better results than a modified state-of-the-art pose estimation system for rigid objects.},
}