Skip to content

Commit

Permalink
Fixes in Clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
derohde committed Aug 20, 2021
1 parent 2c7089d commit 1103cc2
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 75 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "pybind11"]
path = pybind11
url = https://github.com/pybind/pybind11.git
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ By default, Fred will automatically determine the number of threads to use. If y
### Curves
- signature: `fred.Curves()`
- methods: `fred.Curves.add(curve)`: add curve, `fred.Curves[i]`: get ith curve, `len(fred.Curves)`: number of curves, `fred.Curves.simplify(l)`: return set of simplified curves
- properties: `fred.Curves.m`: maximum complexity of the contained curves
- properties: `fred.Curves.m`: maximum complexity of the contained curves, `fred.Curves.values`: curves as `np.ndarray`

#### continuous Fréchet distance
- signature: `fred.continuous_frechet(curve1, curve2)`
Expand All @@ -32,6 +32,10 @@ By default, Fred will automatically determine the number of threads to use. If y
- signature: `fred.discrete_frechet(curve1, curve2)`
- returns: `fred.Discrete_Frechet_Result` with members `value` and `time`

#### discrete dynamic time warping distance
- signature: `fred.discrete_dynamic_time_warping(curve1, curve2)`
- returns: `fred.Discrete_Dynamic_Time_Warping_Distance_Result` with members `value` and `time`

### Curve Simplification

#### weak minimum error simplification
Expand Down Expand Up @@ -69,7 +73,7 @@ A `fred.Distance_Matrix()` can be used to speed up consecutive calls of `fred.di
- `assignment`: empty if compute_assignment has not been called

#### discrete (k,l)-median clustering (continuous Fréchet)
- Algorithm 6 in [**Coresets for (k,l)-Clustering under the Fréchet distance**](https://arxiv.org/pdf/1901.01870.pdf) + simplification
- Algorithm from section 4.3 in [**Geometric Approximation Algorithms**](http://www.ams.org/books/surv/173/) + simplification
- signature: `fred.discrete_klmedian_multi(k, l, curves, distances, center_domain)` with parameters
- `k`: number of centers
- `l`: maximum complexity of the centers, only used when center_domain is default value
Expand Down
69 changes: 45 additions & 24 deletions include/clustering.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,36 +128,43 @@ struct Clustering_Result {
}
};


Clustering_Result gonzalez(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Distance_Matrix &distances, const bool arya = false, const Curves &center_domain = Curves(), const bool random_start_center = true) {
Clustering_Result kl_center(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Distance_Matrix &distances, const bool local_search = false, const Curves &center_domain = Curves(), const bool random_start_center = true) {

const auto start = std::chrono::high_resolution_clock::now();
Clustering_Result result;

if (in.empty()) return result;

std::vector<curve_number_t> centers;
const Curves &simplified_in = center_domain;
Curves &simplified_in = const_cast<Curves&>(center_domain);
bool self_simplify = false;

if (center_domain.empty()) {
self_simplify = true;
Curves simplified_in_self(in.number(), ell, in.dimensions());

for (curve_number_t i = 0; i < in.size(); ++i) {
Simplification::Subcurve_Shortcut_Graph graph(const_cast<Curve&>(in[i]));
auto simplified_curve = graph.weak_minimum_error_simplification(ell);
simplified_curve.set_name("Simplification of " + in[i].get_name());
simplified_in_self[i] = simplified_curve;
}
const_cast<Curves&>(simplified_in) = simplified_in_self;
simplified_in = simplified_in_self;
}

if (random_start_center) {

Random::Uniform_Random_Generator<double> ugen;
const curve_number_t r = std::floor(simplified_in.size() * ugen.get());
if (self_simplify) {
Simplification::Subcurve_Shortcut_Graph graph(const_cast<Curve&>(in[r]));
auto simplified_curve = graph.weak_minimum_error_simplification(ell);
simplified_curve.set_name("Simplification of " + in[r].get_name());
simplified_in[r] = simplified_curve;
}
centers.push_back(r);

} else centers.push_back(0);
} else {
if (self_simplify) {
Simplification::Subcurve_Shortcut_Graph graph(const_cast<Curve&>(in[0]));
auto simplified_curve = graph.weak_minimum_error_simplification(ell);
simplified_curve.set_name("Simplification of " + in[0].get_name());
simplified_in[0] = simplified_curve;
}
centers.push_back(0);
}

distance_t curr_maxdist = 0;
curve_number_t curr_maxcurve = 0;
Expand Down Expand Up @@ -188,35 +195,49 @@ Clustering_Result gonzalez(const curve_number_t num_centers, const curve_size_t
std::cout << "found center no. " << i+1 << std::endl;
#endif

if (self_simplify and simplified_in[curr_maxcurve].empty()) {
Simplification::Subcurve_Shortcut_Graph graph(const_cast<Curve&>(in[curr_maxcurve]));
auto simplified_curve = graph.weak_minimum_error_simplification(ell);
simplified_curve.set_name("Simplification of " + in[curr_maxcurve].get_name());
simplified_in[curr_maxcurve] = simplified_curve;
}
centers.push_back(curr_maxcurve);
}
}
}

if (arya) {
if (local_search) {

auto cost = _center_cost_sum(in, simplified_in, centers, distances);
auto approxcost = cost;
auto gamma = 1/(std::log(in.size()) * num_centers);
auto found = true;
distance_t cost = _center_cost_sum(in, simplified_in, centers, distances);
distance_t approxcost = cost;
distance_t curr_cost = cost;
distance_t gamma = 1/(10 * num_centers);
bool found = true;
auto curr_centers = centers;

// try to improve current solution
while (found) {
found = false;

// go through all centers
for (curve_number_t i = 0; i < num_centers; ++i) {
auto curr_centers = centers;
curr_centers = centers;

// check if there is a better center among all other curves
for (curve_number_t j = 0; j < simplified_in.size(); ++j) {
// continue if curve is already part of center set
if (std::find(curr_centers.begin(), curr_centers.end(), j) != curr_centers.end()) continue;

// swap
if (self_simplify and simplified_in[j].empty()) {
Simplification::Subcurve_Shortcut_Graph graph(const_cast<Curve&>(in[j]));
auto simplified_curve = graph.weak_minimum_error_simplification(ell);
simplified_curve.set_name("Simplification of " + in[j].get_name());
simplified_in[j] = simplified_curve;
}
curr_centers[i] = j;
// new cost
const auto curr_cost = _center_cost_sum(in, simplified_in, curr_centers, distances);
curr_cost = _center_cost_sum(in, simplified_in, curr_centers, distances);
// check if improvement is done
if (curr_cost < cost - gamma * approxcost) {
cost = curr_cost;
Expand All @@ -239,15 +260,15 @@ Clustering_Result gonzalez(const curve_number_t num_centers, const curve_size_t
return result;
}

Clustering_Result arya(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Distance_Matrix &distances, const Curves &center_domain = Curves()) {
return gonzalez(num_centers, ell, in, distances, true, center_domain, false);
// (k,l)-median entry point: a thin forwarder to kl_center.
//
// All arguments are passed through unchanged; the only thing this wrapper
// decides is the two boolean switches of kl_center:
//   - local_search        = true   (run the swap-based improvement phase)
//   - random_start_center = false  (start from curve index 0)
//
// num_centers   : number of centers to compute
// ell           : complexity bound for the simplified curves; kl_center only
//                 uses it when center_domain is empty
// in            : input curves
// distances     : distance matrix handed on to kl_center
// center_domain : candidate center curves; defaults to an empty Curves
//
// Returns the Clustering_Result produced by kl_center.
Clustering_Result kl_median(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Distance_Matrix &distances, const Curves &center_domain = Curves()) {
    const bool local_search = true;
    const bool random_start_center = false;
    return kl_center(num_centers, ell, in, distances, local_search, center_domain, random_start_center);
}

Clustering_Result one_median_sampling(const curve_size_t ell, const Curves &in, const double epsilon, const Curves &center_domain = Curves()) {
const auto start = std::chrono::high_resolution_clock::now();
Clustering_Result result;
std::vector<curve_number_t> centers;
const Curves &simplified_in = center_domain;
Curves &simplified_in = const_cast<Curves&>(center_domain);

if (center_domain.empty()) {
Curves simplified_in_self(in.number(), ell, in.dimensions());
Expand All @@ -258,7 +279,7 @@ Clustering_Result one_median_sampling(const curve_size_t ell, const Curves &in,
simplified_curve.set_name("Simplification of " + in[i].get_name());
simplified_in_self[i] = simplified_curve;
}
const_cast<Curves&>(simplified_in) = simplified_in_self;
simplified_in = simplified_in_self;
}

const auto n = in.size();
Expand Down
59 changes: 27 additions & 32 deletions include/coreset.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,37 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI

namespace Coreset {

class Onemedian_Coreset {
class K_Median_Coreset {

std::vector<curve_number_t> coreset;
std::vector<distance_t> lambda;
const distance_t Lambda = 76;
std::vector<parameter_t> lambda;
distance_t Lambda;
distance_t cost;

public:
Onemedian_Coreset() {}
K_Median_Coreset() {}

inline Onemedian_Coreset(const curve_size_t ell, const Curves &in, const distance_t epsilon, const double constant = 1) {
compute(ell, in, epsilon, constant);
// Constructs the coreset and immediately runs compute() on the given input.
//
// k        : number of centers, forwarded to compute()
// ell      : complexity bound, forwarded to compute()
// in       : input curves
// epsilon  : forwarded as compute()'s `epsilon` argument
// constant : forwarded positionally as the fifth argument of compute()
//
// NOTE(review): compute()'s fifth parameter is named `eps`, and its own
// `constant` parameter is the seventh (after `round`). As written, `constant`
// is therefore bound to `eps`, while compute()'s `round` and `constant` keep
// their defaults. Confirm whether this is intended.
inline K_Median_Coreset(const curve_number_t k, curve_size_t ell, const Curves &in, const distance_t epsilon, const double constant = 1) {
compute(k, ell, in, epsilon, constant);
}

inline void compute(const curve_size_t ell, const Curves &in, const distance_t epsilon, const double eps, const bool round = true, const double constant = 1) {
inline void compute(const curve_number_t k, curve_size_t ell, const Curves &in, const distance_t epsilon, const double eps, const bool round = true, const double constant = 1) {
const auto n = in.size();
const auto m = in.get_m();
auto distances = Clustering::Distance_Matrix(in.size(), in.size());
const auto c_approx = Clustering::arya(1, ell, in, distances, false);
const auto center = c_approx.centers[0];
const auto c_approx = Clustering::kl_median(k, ell, in, distances, false);
const auto centers = c_approx.centers;
cost = c_approx.value;
if (cost == 0) {
std::cerr << "WARNING: cost is zero, coreset construction not possible - check your input" << std::endl;
return;
}
std::vector<double> probabilities(n);
lambda = std::vector<distance_t>(n);

lambda = std::vector<parameter_t>(n);
Lambda = 2*k + 12*std::sqrt(k) + 18;
// to do: remainder
for (curve_number_t i = 0; i < n; ++i) {
lambda[i] = 52.0 / n + 24.0 / cost * Frechet::Continuous::distance(in[i], center).value;
lambda[i] = 52.0 / n + 24.0 / cost * Frechet::Continuous::distance(in[i], centers[0]).value;
probabilities[i] = (lambda[i]) / Lambda;
}

Expand All @@ -56,31 +57,25 @@ class Onemedian_Coreset {
}
}

// inline np::ndarray get_lambda() const {
// np::dtype dt = np::dtype::get_builtin<distance_t>();
// p::list l;
// np::ndarray result = np::array(l, dt);
// for (const auto &elem: lambda) {
// l.append(elem);
// }
// result = np::array(l, dt);
// return result;
// }
inline auto get_lambda() const {
py::list l;
for (const auto &elem : lambda) {
l.append(elem);
}
return py::array_t<parameter_t>(l);
}

// Returns the stored `Lambda` member (assigned inside compute()).
//
// NOTE(review): the visible declaration of `Lambda` has no initializer, and
// compute() appears to assign it only after its early return for cost == 0.
// The value read here before a successful compute() is presumably
// unspecified - confirm.
inline distance_t get_Lambda() const {
    return this->Lambda;
}

// inline np::ndarray get_curves() const {
// np::dtype dt = np::dtype::get_builtin<curve_number_t>();
// p::list l;
// np::ndarray result = np::array(l, dt);
// for (const auto &elem: coreset) {
// l.append(elem);
// }
// result = np::array(l, dt);
// return result;
// }
inline auto get_curves() const {
py::list l;
for (const auto &elem: coreset) {
l.append(elem);
}
return py::array_t<curve_number_t>(l);
}

inline distance_t get_cost() const {
return cost;
Expand Down
13 changes: 10 additions & 3 deletions include/curve.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class Curve : private Points {
return Points::end();
}

inline auto empty() const {
inline bool empty() const {
return Points::empty();
}

Expand Down Expand Up @@ -125,8 +125,7 @@ class Curve : private Points {
for (const Point &elem : *this) {
l.append(elem.as_ndarray());
}
auto result = py::array_t<coordinate_t>(l);
return result;
return py::array_t<coordinate_t>(l);
}

void set_name(const std::string&);
Expand Down Expand Up @@ -175,6 +174,14 @@ class Curves : public std::vector<Curve> {
return dim;
}

inline auto as_ndarray() const {
py::list l;
for (const Curve &elem : *this) {
l.append(elem.as_ndarray());
}
return py::array_t<coordinate_t>(l);
}

Curves simplify(const curve_size_t);

std::string str() const;
Expand Down
6 changes: 2 additions & 4 deletions include/point.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,7 @@ class Point : public Coordinates {
for (const coordinate_t &elem : *this) {
l.append(elem);
}
auto result = py::array_t<coordinate_t>(l);
return result;
return py::array_t<coordinate_t>(l);
}

std::string str() const;
Expand Down Expand Up @@ -223,8 +222,7 @@ class Points : public std::vector<Point> {
for (const Point &elem : *this) {
l.append(elem.as_ndarray());
}
auto result = py::array_t<coordinate_t>(l);
return result;
return py::array_t<coordinate_t>(l);
}

std::string str() const;
Expand Down
1 change: 1 addition & 0 deletions pybind11
Submodule pybind11 added at 4f29b8
13 changes: 7 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,21 +64,22 @@ def build_extension(self, ext):
self.distribution.get_version())
if not os.path.exists(self.build_temp):
os.makedirs(self.build_temp)
subprocess.check_call(['git', 'init'])
subprocess.check_call(['git', 'submodule', 'add', 'https://github.com/pybind/pybind11.git'])
subprocess.check_call(['git', 'submodule', 'update', '--init', '--recursive'])
subprocess.call(['git', 'init'])
subprocess.call(['git', 'submodule', 'add', 'https://github.com/pybind/pybind11.git'])
subprocess.call(['git', 'submodule', 'update', '--init', '--recursive'])
subprocess.check_call(['cmake', "{}".format(ext.sourcedir)] + cmake_args,
cwd=self.build_temp, env=env)
subprocess.check_call(['cmake', '--build', '.'] + build_args,
cwd=self.build_temp)

setup(
name='Fred-Frechet',
version='1.7.2',
version='1.7.3',
author='Dennis Rohde',
author_email='[email protected]',
description='Frechet Distance and Clustering Library',
long_description='A fast, scalable and light-weight C++ Fréchet distance library, exposed to python and focused on (k,l)-clustering of polygonal curves.',
description='A fast, scalable and light-weight C++ Fréchet distance library, exposed to python and focused on (k,l)-clustering of polygonal curves.',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
url="http://fred.dennisrohde.work",
packages=setuptools.find_packages(),
ext_package="Fred",
Expand Down
9 changes: 5 additions & 4 deletions src/fred_python_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@ Clustering::Clustering_Result dtw_one_median_exact(const Curves &in) {
}

Clustering::Clustering_Result klcenter(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Clustering::Distance_Matrix &distances, const Curves &center_domain = Curves(), const bool random_start_center = true) {
auto result = Clustering::gonzalez(num_centers, ell, in, distances, false, center_domain, random_start_center);
auto result = Clustering::kl_center(num_centers, ell, in, distances, false, center_domain, random_start_center);
return result;
}

Clustering::Clustering_Result klmedian(const curve_number_t num_centers, const curve_size_t ell, const Curves &in, Clustering::Distance_Matrix distances, const Curves &center_domain = Curves()) {

auto result = Clustering::arya(num_centers, ell, in, distances, center_domain);
auto result = Clustering::kl_median(num_centers, ell, in, distances, center_domain);

return result;
}
Expand Down Expand Up @@ -140,7 +140,7 @@ PYBIND11_MODULE(backend, m) {
.def("__str__", &Point::str)
.def("__iter__", [](Point &v) { return py::make_iterator(v.begin(), v.end()); }, py::keep_alive<0, 1>())
.def("__repr__", &Point::repr)
//.def_property_readonly("values", &Point::as_ndarray)
.def_property_readonly("values", &Point::as_ndarray)
;

py::class_<Points>(m, "Points")
Expand Down Expand Up @@ -179,6 +179,7 @@ PYBIND11_MODULE(backend, m) {
.def("__str__", &Curves::str)
.def("__iter__", [](Curves &v) { return py::make_iterator(v.begin(), v.end()); }, py::keep_alive<0, 1>())
.def("__repr__", &Curves::repr)
.def_property_readonly("values", &Curves::as_ndarray)
;

py::class_<fc::Distance>(m, "Continuous_Frechet_Distance")
Expand All @@ -197,7 +198,7 @@ PYBIND11_MODULE(backend, m) {
.def("__repr__", &fd::Distance::repr)
;

py::class_<ddtw::Distance>(m, "Discrete_Dynamic_Time_Warping_Distance")
py::class_<ddtw::Distance>(m, "Discrete_Dynamic_Time_Warping_Distance_Result")
.def(py::init<>())
.def_readwrite("time", &ddtw::Distance::time)
.def_readwrite("value", &ddtw::Distance::value)
Expand Down

0 comments on commit 1103cc2

Please sign in to comment.