K-Nearest Neighbor Graph Testing Library
C++ Python library that is able to import exisiting NN-structures; Implements Property Testing Algorithm that rejects with high probability if queries to given structure are epsilon-far from giving a K-Nearest Neighbor Graph
knn_graph.hpp
1 /*
2 Copyright 2018 Dennis Rohde
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 
6 The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 
8 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
9 */
10 
11 #pragma once
12 
13 #include <algorithm>
14 #include <vector>
15 #include <limits>
16 #include <cmath>
17 
18 #include <boost/python/numpy.hpp>
19 
20 #include "relation.hpp"
21 
22 namespace np = boost::python::numpy;
23 
24 template <typename T = double>
25 class KNN_Graph {
26 public:
27  typedef unsigned long long index_type;
28 
29  class Adjacency_List : public std::vector<index_type> {
30  public:
31  inline auto get(const index_type i) const {
32  if (i >= this->size()) return std::numeric_limits<index_type>::signaling_NaN();
33  return (*this)[i];
34  }
35 
36  inline auto length() const {
37  return this->size();
38  }
39 
40  inline auto pbegin() const {
41  return this->begin();
42  }
43 
44  inline auto pend() const {
45  return this->end();
46  }
47  };
48 
49  typedef Tuple<T> location_type;
50  typedef Relation<T> vertices_type;
52  typedef std::vector<adjacency_list_type> edges_type;
53 
54  inline static T euclidean_distance(const location_type &a, const location_type &b) {
55  auto diff = a - b;
56  return sqrt(diff * diff);
57  }
58 
59  inline static T euclidean_distance_squared(const location_type &a, const location_type &b) {
60  auto diff = a - b;
61  return diff * diff;
62  }
63 
64  KNN_Graph() : k{0}, vertices{}, edges{} {}
65 
66  KNN_Graph(const unsigned long k) : k{k}, vertices{}, edges{} {}
67 
68  void sort() {
69  std::vector<index_type> index(number_vertices());
70 
71  #pragma omp parallel for shared(index)
72  for (index_type i = 0; i < number_vertices(); ++i) {
73  index[i] = i;
74  }
75 
76  auto less = [&](const index_type lhs, const index_type rhs) -> bool {
77  return vertices[lhs] < vertices[rhs];
78  };
79 
80  std::sort(index.begin(), index.end(), less);
81 
82  vertices_type new_vertices(number_vertices());
83  edges_type new_edges(number_vertices());
84 
85  #pragma omp parallel for shared(new_vertices, new_edges, index)
86  for (index_type i = 0; i < number_vertices(); ++i) {
87  new_vertices[i] = vertices[index[i]];
88  new_edges[i] = edges[index[i]];
89  std::sort(new_edges[i].begin(), new_edges[i].end(), less);
90  }
91 
92  vertices = new_vertices;
93  edges = new_edges;
94  }
95 
96  auto epsilon(const KNN_Graph<T> &hp) const {
97  double epsilon = 0.0;
98  auto g = *this;
99  auto h = hp;
100 
101  g.sort();
102  h.sort();
103 
104  index_type i = 0, j = 0;
105  unsigned long long denominator = 0;
106 
107  while (i < g.number_vertices() and j < h.number_vertices()) {
108  if (g.get_vertex(i) == h.get_vertex(j)) {
109  auto g_neighbors = g.get_neighbors(i);
110  auto h_neighbors = h.get_neighbors(j);
111  auto g_number_neighbors = g_neighbors.size();
112  auto h_number_neighbors = h_neighbors.size();
113  denominator += g_number_neighbors;
114  index_type k = 0, l = 0;
115  while (k < g_number_neighbors and l < h_number_neighbors) {
116  if (g_neighbors[k] < h_neighbors[l]) {
117  epsilon += 1;
118  ++k;
119  } else if (g_neighbors[k] > h_neighbors[l]) {
120  ++l;
121  } else {
122  ++k;
123  ++l;
124  }
125  }
126  epsilon += g_number_neighbors - k;
127  ++i;
128  ++j;
129  } else if (g.get_vertex(i) < h.get_vertex(j)) {
130  auto number_neighbors = g.get_neighbors(i).size();
131  epsilon += number_neighbors;
132  denominator += number_neighbors;
133  ++i;
134  } else {
135  ++j;
136  }
137  }
138 
139  for(; i < g.number_vertices(); ++i) {
140  auto number_neighbors = g.get_neighbors(i).size();
141  epsilon += number_neighbors;
142  denominator += number_neighbors;
143  }
144 
145  epsilon /= denominator;
146 
147  return epsilon;
148  }
149 
150  inline auto dimension() const {
151  return number_vertices() == 0 ? 0 : vertices[0].dimension();
152  }
153 
154  inline auto number_vertices() const {
155  return vertices.size();
156  }
157 
158  inline auto number_edges() const {
159  return edges_number;
160  }
161 
162  inline auto number_wrongly_connected_vertices() const {
163  unsigned long long result = 0;
164 
165  #pragma omp parallel for shared(result)
166  for (index_type i = 0; i < number_vertices(); ++i) {
167  T distN = 0;
168  auto wrongly_connected = false;
169  auto &adj_list = edges[i];
170 
171  for (index_type j = 0; j < adj_list.size(); ++j) {
172  auto dist = euclidean_distance_squared(vertices[i], vertices[adj_list[j]]);
173  if (std::fabs(dist - distN) > std::numeric_limits<T>::epsilon() and dist > distN) {
174  distN = dist;
175  }
176  }
177 
178  for (index_type j = 0; j < number_vertices(); ++j) {
179  if (wrongly_connected) continue;
180  else if (std::find(adj_list.begin(), adj_list.end(), j) == adj_list.end() and i != j) {
181  auto dist = euclidean_distance_squared(vertices[i], vertices[j]);
182  if (std::fabs(dist - distN) > std::numeric_limits<T>::epsilon() and dist < distN) {
183  #pragma omp critical
184  {
185  if (not wrongly_connected) {
186  ++result;
187  wrongly_connected = true;
188  }
189  }
190  }
191  }
192  }
193  }
194  return result;
195  }
196 
197  inline auto get_vertex(const index_type i) const {
198  return vertices[i];
199  }
200 
201  inline auto get_neighbors(const index_type i) const {
202  return edges[i];
203  }
204 
205  inline auto number_neighbors(const index_type i) const {
206  return edges[i].size();
207  }
208 
209  inline auto get_k() const {
210  return k;
211  }
212 
213  void add_edge(const index_type i, const index_type j) {
214  this->edges[i].push_back(j);
215  this->edges_number++;
216  }
217 
218  virtual void build(const vertices_type &vertices) {
219  this->vertices = vertices;
220  this->edges = edges_type(vertices.size());
221  }
222 
223  virtual void build(const np::ndarray &in) {
224  this->vertices = vertices_type{in};
225  this->edges = edges_type(vertices.size());
226  }
227 
228  void edges_from_ndarray(const np::ndarray &in) {
229  auto dimensions = in.get_nd();
230  if (dimensions != 2 or in.get_dtype() != np::dtype::get_builtin<bool>()) {
231  std::cerr << "Need 2-dimensional numpy array of type bool!" << std::endl;
232  return;
233  }
234  this->edges = edges_type(vertices.size());
235  auto size = in.get_shape();
236  auto strides = in.get_strides();
237  auto data = in.get_data();
238  bool adjacent = false;
239  for (index_type i = 0; i < size[0]; ++i) {
240  for (index_type j = 0; j < size[1]; ++j) {
241  adjacent = *reinterpret_cast<const bool*>(data + i * strides[0] + j * strides[1]);
242  if (adjacent) {
243  this->edges[i].push_back(j);
244  ++this->edges_number;
245  }
246  }
247  }
248  }
249 
250  inline const auto& get_edges() const {
251  return edges;
252  }
253 
254  inline const auto& get_vertices() const {
255  return vertices;
256  }
257 
258  inline auto& get_edges() {
259  return edges;
260  }
261 
262  inline auto& get_vertices() {
263  return vertices;
264  }
265 
266  inline auto edges_begin() const {
267  return edges.begin();
268  }
269 
270  inline auto edges_end() const {
271  return edges.end();
272  }
273 
274  inline auto vertices_begin() const {
275  return vertices.begin();
276  }
277 
278  inline auto vertices_end() const {
279  return vertices.end();
280  }
281 
282  inline auto as_str() const {
283  std::stringstream ss;
284  ss << *this;
285  return ss.str();
286  }
287 
288  inline auto repr() const {
289  std::stringstream ss;
290  ss << get_k() <<"-Nearest Neighbor Graph of Dimension " << dimension() << ":" << std::endl;
291  ss << as_str();
292  return ss.str();
293  }
294 
295 protected:
296  unsigned long k;
297  unsigned long long edges_number = 0;
298  vertices_type vertices;
299  edges_type edges;
300 
301 };
302 
303 template <typename T = double>
304 class KNN_Graph_Exact : public KNN_Graph<T> {
305  typedef KNN_Graph<T> super;
306  typedef typename super::index_type index_type;
307 
308 public:
309  KNN_Graph_Exact(const unsigned long k = 10) {
310  this->k = k;
311  this->vertices = typename super::vertices_type{};
312  this->edges = typename super::edges_type{};
313  }
314 
315  void build(const np::ndarray &in) {
316  this->build(typename super::vertices_type(in));
317  }
318 
319  void build(const typename super::vertices_type &vertices) {
320  this->vertices = vertices;
321  this->edges = typename super::edges_type(vertices.size());
322  std::cout << "Building exact " << this->k << "-NNGraph with " << vertices.size() << " vertices:" << std::endl;
323 
324  auto n = vertices.size();
325 
326  for (index_type i = 0; i < n; ++i) {
327  auto distance_furthest = std::numeric_limits<double>::infinity();
328 
329  for (index_type j = 0; j < n; ++j) {
330 
331  if (i != j) {
332 
333  auto dist = this->euclidean_distance_squared(this->vertices[i], this->vertices[j]);
334 
335  if (this->edges[i].length() >= this->k) {
336  if (dist < distance_furthest) {
337  index_type furthest = 0;
338  index_type neighbor = 0;
339  index_type furthest_neighbor = 0;
340 
341  for (index_type l = 0; l < this->edges[i].length(); ++l) {
342  neighbor = this->edges[i][l];
343  furthest_neighbor = this->edges[i][furthest];
344 
345  if (this->euclidean_distance_squared(this->vertices[i], this->vertices[furthest_neighbor]) < this->euclidean_distance_squared(this->vertices[i], this->vertices[neighbor])) {
346  furthest = l;
347  }
348  }
349  this->edges[i][furthest] = j;
350 
351  furthest = 0;
352  for (index_type l = 0; l < this->edges[i].length(); ++l) {
353  neighbor = this->edges[i][l];
354  furthest_neighbor = this->edges[i][furthest];
355 
356  if (this->euclidean_distance_squared(this->vertices[i], this->vertices[furthest_neighbor]) < this->euclidean_distance_squared(this->vertices[i], this->vertices[neighbor])) {
357  furthest = l;
358  }
359  }
360  distance_furthest = this->euclidean_distance_squared(this->vertices[i], this->vertices[this->edges[i][furthest]]);
361  }
362  } else {
363  if (this->edges[i].length() == 0) {
364  distance_furthest = dist;
365  } else {
366  distance_furthest = std::max(dist, distance_furthest);
367  }
368  this->edges[i].push_back(j);
369  }
370  }
371  }
372 
373  }
374  this->edges_number = n * this->k;
375  std::cout << "Exact " << this->k << "-NNGraph built." << std::endl;
376  }
377 };
378 
379 template <typename T = double>
380 std::ostream& operator<<(std::ostream &out, const typename KNN_Graph<T>::Adjacency_List &a) {
381  const auto size = a.size();
382  for (auto i = size-size; i < size-1; ++i) {
383  out << a.get(i);
384  out << ", ";
385  }
386  out << a.get(size-1) << std::endl;
387  return out;
388 }
389 
390 template <typename T>
391 std::ostream& operator<<(std::ostream &out, const KNN_Graph<T> &g) {
392  const auto size = g.number_vertices();
393  for (auto i = size-size; i < size; ++i) {
394  out << i << "-> ";
395  out << g.get_neighbors(i);
396  }
397  return out;
398 }
Definition: knn_graph.hpp:29
Definition: knn_graph.hpp:304
Definition: knn_graph.hpp:25
Definition: relation.hpp:24
Definition: d_dim_tuple.hpp:24