@conference {Bhowmik2019, title = {Reinforced Feature Points: Optimizing Feature Detection and Description for a High-Level Task}, booktitle = {CVPR 2020 (oral)}, year = {2020}, abstract = {We address a core problem of computer vision: Detection and description of 2D feature points for image matching. For a long time, hand-crafted designs, like the seminal SIFT algorithm, were unsurpassed in accuracy and efficiency. Recently, learned feature detectors emerged that implement detection and description using neural networks. Training these networks usually resorts to optimizing low-level matching scores, often pre-defining sets of image patches which should or should not match, or which should or should not contain key points. Unfortunately, increased accuracy for these low-level matching scores does not necessarily translate to better performance in high-level vision tasks. We propose a new training methodology which embeds the feature detector in a complete vision pipeline, and where the learnable parameters are trained in an end-to-end fashion. We overcome the discrete nature of key point selection and descriptor matching using principles from reinforcement learning. As an example, we address the task of relative pose estimation between a pair of images. We demonstrate that the accuracy of a state-of-the-art learning-based feature detector can be increased when trained for the task it is supposed to solve at test time. Our training methodology poses few restrictions on the task to be learned, and works for any architecture which predicts key point heat maps and descriptors for key point locations.}, url = {http://arxiv.org/abs/1912.00623}, author = {Bhowmik, Aritra and Gumhold, Stefan and Rother, Carsten and Brachmann, Eric} } @conference {Leistner2019, title = {Learning to Think Outside the Box: Wide-Baseline Light Field Depth Estimation with EPI-Shift}, booktitle = {Proceedings - 2019 International Conference on 3D Vision, 3DV 2019}, year = {2019}, month = {sep}, pages = {249{\textendash}257}, abstract = {We propose a method for depth estimation from light field data, based on a fully convolutional neural network architecture. Our goal is to design a pipeline which achieves highly accurate results for small- and wide-baseline light fields. Since light field training data is scarce, all learning-based approaches use a small receptive field and operate on small disparity ranges. In order to work with wide-baseline light fields, we introduce the idea of EPI-Shift: to virtually shift the light field stack, which makes it possible to retain a small receptive field, independent of the disparity range. In this way, our approach {\textquotedblleft}learns to think outside the box of the receptive field{\textquotedblright}. Our network performs joint classification of integer disparities and regression of disparity-offsets. A U-Net component provides excellent long-range smoothing. EPI-Shift considerably outperforms the state-of-the-art learning-based approaches and is on par with hand-crafted methods. 
We demonstrate this on a publicly available, synthetic, small-baseline benchmark and on large-baseline real-world recordings.}, keywords = {Computer vision, deep learning, depth estimation, light fields, stereo}, isbn = {9781728131313}, doi = {10.1109/3DV.2019.00036}, url = {http://arxiv.org/abs/1909.09059 http://dx.doi.org/10.1109/3DV.2019.00036}, author = {Leistner, Titus and Schilling, Hendrik and Mackowiak, Radek and Gumhold, Stefan and Rother, Carsten} } @conference {Brachmann2017, title = {DSAC - Differentiable RANSAC for camera localization}, booktitle = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017}, volume = {2017-Janua}, year = {2017}, month = {nov}, pages = {2492{\textendash}2500}, abstract = {RANSAC is an important algorithm in robust optimization and a central building block for many computer vision applications. In recent years, traditionally hand-crafted pipelines have been replaced by deep learning pipelines, which can be trained in an end-to-end fashion. However, RANSAC has so far not been used as part of such deep learning pipelines, because its hypothesis selection procedure is non-differentiable. In this work, we present two different ways to overcome this limitation. The most promising approach is inspired by reinforcement learning, namely to replace the deterministic hypothesis selection by a probabilistic selection for which we can derive the expected loss w.r.t. all learnable parameters. We call this approach DSAC, the differentiable counterpart of RANSAC. We apply DSAC to the problem of camera localization, where deep learning has so far failed to improve on traditional approaches. 
We demonstrate that by directly minimizing the expected loss of the output camera poses, robustly estimated by RANSAC, we achieve an increase in accuracy. In the future, any deep learning pipeline can use DSAC as a robust optimization component.}, isbn = {9781538604571}, doi = {10.1109/CVPR.2017.267}, url = {http://arxiv.org/abs/1611.05705}, author = {Brachmann, Eric and Krull, Alexander and Nowozin, Sebastian and Shotton, Jamie and Michel, Frank and Gumhold, Stefan and Rother, Carsten} } @conference {Michel2017, title = {Global hypothesis generation for 6D object pose estimation}, booktitle = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017}, volume = {2017-Janua}, year = {2017}, month = {dec}, pages = {115{\textendash}124}, abstract = {This paper addresses the task of estimating the 6D pose of a known 3D object from a single RGB-D image. Most modern approaches solve this task in three steps: i) Compute local features; ii) Generate a pool of pose-hypotheses; iii) Select and refine a pose from the pool. This work focuses on the second step. While all existing approaches generate the hypotheses pool via local reasoning, e.g. RANSAC or Hough-voting, we are the first to show that global reasoning is beneficial at this stage. In particular, we formulate a novel fully-connected Conditional Random Field (CRF) that outputs a very small number of pose-hypotheses. Despite the potential functions of the CRF being non-Gaussian, we give a new and efficient two-step optimization procedure, with some guarantees for optimality. We utilize our global hypotheses generation procedure to produce results that exceed state-of-the-art for the challenging {\textquotedblleft}Occluded Object Dataset{\textquotedblright}.}, isbn = {9781538604571}, doi = {10.1109/CVPR.2017.20}, url = {http://arxiv.org/abs/1612.02287}, author = {Michel, Frank and Kirillov, Alexander and Brachmann, Eric and Krull, Alexander and Gumhold, Stefan and Savchynskyy, Bogdan and Rother, Carsten} } @conference {Brachmann2016, title = {Uncertainty-Driven 6D Pose Estimation of Objects and Scenes from a Single RGB Image}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, volume = {2016-Decem}, year = {2016}, pages = {3364{\textendash}3372}, abstract = {In recent years, the task of estimating the 6D pose of object instances and complete scenes, i.e. camera localization, from a single input image has received considerable attention. Consumer RGB-D cameras have made this feasible, even for difficult, texture-less objects and scenes. In this work, we show that a single RGB image is sufficient to achieve visually convincing results. Our key concept is to model and exploit the uncertainty of the system at all stages of the processing pipeline. The uncertainty comes in the form of continuous distributions over 3D object coordinates and discrete distributions over object labels. We give three technical contributions. Firstly, we develop a regularized, auto-context regression framework which iteratively reduces uncertainty in object coordinate and object label predictions. Secondly, we introduce an efficient way to marginalize object coordinate distributions over depth. This is necessary to deal with missing depth information. Thirdly, we utilize the distributions over object labels to detect multiple objects simultaneously with a fixed budget of RANSAC hypotheses. We tested our system for object pose estimation and camera localization on commonly used data sets. 
We see a major improvement over competing systems.}, isbn = {9781467388504}, issn = {10636919}, doi = {10.1109/CVPR.2016.366}, author = {Brachmann, Eric and Michel, Frank and Krull, Alexander and Yang, Michael Ying and Gumhold, Stefan and Rother, Carsten} } @conference {Krull2015, title = {Learning analysis-by-synthesis for 6D pose estimation in RGB-D images}, booktitle = {Proceedings of the IEEE International Conference on Computer Vision}, volume = {2015 Inter}, year = {2015}, pages = {954{\textendash}962}, abstract = {Analysis-by-synthesis has been a successful approach for many tasks in computer vision, such as 6D pose estimation of an object in an RGB-D image, which is the topic of this work. The idea is to compare the observation with the output of a forward process, such as a rendered image of the object of interest in a particular pose. Due to occlusion or complicated sensor noise, it can be difficult to perform this comparison in a meaningful way. We propose an approach that {\textquotedblleft}learns to compare{\textquotedblright}, while taking these difficulties into account. This is done by describing the posterior density of a particular object pose with a convolutional neural network (CNN) that compares observed and rendered images. The network is trained with the maximum likelihood paradigm. We observe empirically that the CNN does not specialize to the geometry or appearance of specific objects. It can be used with objects of vastly different shapes and appearances, and in different backgrounds. 
Compared to state-of-the-art, we demonstrate a significant improvement on two different datasets which include a total of eleven objects, cluttered background, and heavy occlusion.}, isbn = {9781467383912}, issn = {15505499}, doi = {10.1109/ICCV.2015.115}, author = {Krull, Alexander and Brachmann, Eric and Michel, Frank and Yang, Michael Ying and Gumhold, Stefan and Rother, Carsten} } @conference {Michel2015, title = {Pose Estimation of Kinematic Chain Instances via Object Coordinate Regression}, year = {2015}, pages = {181.1{\textendash}181.11}, abstract = {In this paper, we address the problem of one-shot pose estimation of articulated objects from an RGB-D image. In particular, we consider object instances with the topology of a kinematic chain, i.e. assemblies of rigid parts connected by prismatic or revolute joints. This object type occurs often in daily life, for instance in the form of furniture or electronic devices. Instead of treating each object part separately, we use the relationship between parts of the kinematic chain and propose a new minimal pose sampling approach. This enables us to create a pose hypothesis for a kinematic chain consisting of K parts by sampling K 3D-3D point correspondences. To assess the quality of our method, we gathered a large dataset containing four objects and 7000+ annotated RGB-D frames. On this dataset we achieve considerably better results than a modified state-of-the-art pose estimation system for rigid objects.}, doi = {10.5244/c.29.181}, author = {Michel, Frank and Krull, Alexander and Brachmann, Eric and Yang, Michael Ying and Gumhold, Stefan and Rother, Carsten} }