Skip to content
Snippets Groups Projects
Commit 13fc6a10 authored by Boris Stefanovic
Browse files

cleanup

parent ab4e705b
Branches
No related tags found
No related merge requests found
...@@ -55,5 +55,8 @@ clean: ...@@ -55,5 +55,8 @@ clean:
debug: ${DEBUG_TARGET} debug: ${DEBUG_TARGET}
./$< -i test/data.txt ./$< -i test/data.txt
test: ${TARGET}
./$< -i test/data.txt -o ~/test_kmeans
exec: ${TARGET} exec: ${TARGET}
./$< ./$<
...@@ -52,8 +52,37 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) { ...@@ -52,8 +52,37 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) {
/**
 * Recompute the center of a cluster as the component-wise mean of its points.
 *
 * A fresh center vector is created, every point of the cluster is added to it,
 * and the sum is divided by the number of points. The previous center is
 * destroyed once the comparison is done.
 *
 * @param cluster  Cluster to update; must be non-NULL, with a non-NULL point
 *                 list and a non-NULL current center (checked with assert).
 * @return true if at least one coordinate of the center differs from the
 *         previous center; false if the cluster holds no points (center left
 *         untouched) or if the recomputed center is identical to the old one.
 */
bool cluster_update_center_int(cluster_int_t* cluster) {
    // validate the cluster before the first dereference
    assert(cluster != NULL);
    assert(cluster->points != NULL);
    // save old center
    vector_int_t* old_center = cluster->center;
    assert(old_center != NULL);
    list_points_node_int_t* node = cluster->points->head;
    // an empty cluster keeps its current center
    if (NULL == node) {
        return false;
    }
    // NOTE(review): assumes vector_create_int() returns a zero-filled vector,
    // since the points are accumulated into it directly — confirm.
    vector_int_t* new_center = vector_create_int(node->point->dim);
    if (NULL == new_center) {
        // creation failed (if it can): keep the old center and report no change
        return false;
    }
    cluster->center = new_center;
    // sum all points into the new center
    while (node != NULL) {
        vector_add_inplace_int(cluster->center, *(node->point));
        node = node->next;
    }
    // divide by number of points to obtain the mean
    vector_div_inplace_int(cluster->center, (int_t) cluster->points->size);
    // the center has changed as soon as ONE coordinate differs from the old one
    bool changed = false;
    for (size_t p = 0; p < cluster->center->dim; ++p) {
        if (cluster->center->data[p] != old_center->data[p]) {
            changed = true;
            break;
        }
    }
    // the old center is no longer referenced by the cluster
    vector_destroy_int(old_center);
    return changed;
}
bool cluster_update_center_fpt(cluster_fpt_t* cluster) { bool cluster_update_center_fpt(cluster_fpt_t* cluster) {
......
#define _GNU_SOURCE #define _GNU_SOURCE
#include "io.h" #include "io.h"
#include <assert.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
...@@ -9,17 +10,21 @@ ...@@ -9,17 +10,21 @@
/**
 * Read one line from `file` and parse it as a base-10 integer.
 *
 * @param file  Stream to read a single line from.
 * @return The value parsed by strtol, converted to int_t. 0 is returned when
 *         `file` is NULL, when no line could be read (EOF or read error), or
 *         when the line does not start with a number (strtol yields 0 then).
 */
int_t io_read_int(FILE* file) {
    char* line = NULL;
    size_t len = 0;
    long res = 0;
    // getline() returns -1 on EOF/error and may leave `line` NULL: do not parse in that case
    if (file != NULL && getline(&line, &len, file) != -1) {
        res = strtol(line, NULL, 10);
    }
    // the buffer has to be released even when getline() failed
    free(line);
    return res;
}
/**
 * Read one line from `file` and parse it as a floating-point number.
 *
 * @param file  Stream to read a single line from.
 * @return The value parsed by strtod, converted to fpt_t. 0 is returned when
 *         `file` is NULL, when no line could be read (EOF or read error), or
 *         when the line does not start with a number (strtod yields 0 then).
 */
fpt_t io_read_fpt(FILE* file) {
    char* line = NULL;
    size_t len = 0;
    double res = 0.0;
    // getline() returns -1 on EOF/error and may leave `line` NULL: do not parse in that case
    if (file != NULL && getline(&line, &len, file) != -1) {
        res = strtod(line, NULL);
    }
    // the buffer has to be released even when getline() failed
    free(line);
    return res;
}
......
...@@ -3,11 +3,49 @@ ...@@ -3,11 +3,49 @@
#include "vector.h" #include "vector.h"
cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) { bool is_vector_in_centers_int(const vector_int_t* center, const cluster_int_t** clusters, const size_t i) {
//TODO for (size_t k = 0; k < i; ++k) {
return NULL; if (vector_equals_int(clusters[k]->center, center)) {
return true;
}
}
return false;
} }
cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) {
// check args and init
if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
cluster_int_t** clusters = calloc(nclusters, sizeof(vector_int_t*));
if (NULL == clusters) return NULL;
for (size_t k = 0; k < nclusters; ++k) {
clusters[k] = cluster_create_int(NULL);
}
// determine range in which we are working
vector_int_t* min = vector_copy_int(points[0]);
vector_int_t* max = vector_copy_int(points[0]);
for (size_t i = 1; i < point_count; ++i) {
for (size_t p = 0; p < max->dim; ++p) {
const int_t value = points[i]->data[p];
if (value < min->data[p]) min->data[p] = value;
if (value > max->data[p]) max->data[p] = value;
}
}
// until we have enough centers
for (size_t i = 0; i < nclusters; ++i) {
vector_int_t* center = vector_create_int(max->dim);
bool valid = false;
while (!valid) {
// initialise center values randomly, within the "multidimensional rectangle" of our set of points
for (size_t p = 0; p < center->dim; ++p) {
center->data[p] = rand_int_range(min->data[p], max->data[p]); // TODO: create a rand_long_range(...) function
}
// check center is not already in clusters, although probability is extremely low...
valid = !is_vector_in_centers_int(center, (const cluster_int_t**) clusters, i);
}
clusters[i] = cluster_create_int(center);
}
return clusters;
}
bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t** clusters, const size_t i) { bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t** clusters, const size_t i) {
for (size_t k = 0; k < i; ++k) { for (size_t k = 0; k < i; ++k) {
...@@ -18,7 +56,6 @@ bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t** ...@@ -18,7 +56,6 @@ bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t**
return false; return false;
} }
cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) { cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) {
// check args and init // check args and init
if (NULL == points || point_count < 2 || nclusters < 2) return NULL; if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
...@@ -55,9 +92,57 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size ...@@ -55,9 +92,57 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size
} }
/**
 * Destroy every cluster referenced by an array of integer clusters.
 *
 * Only the clusters are destroyed (via cluster_destroy_int); the array
 * `clusters` itself is not freed here.
 *
 * @param clusters     Array of cluster pointers; NULL is accepted and ignored.
 * @param nb_clusters  Number of entries in `clusters`.
 */
void kmeans_destroy_clusters_int(cluster_int_t** clusters, const size_t nb_clusters) {
    if (clusters != NULL) {
        size_t idx = 0;
        while (idx < nb_clusters) {
            cluster_destroy_int(clusters[idx]);
            idx++;
        }
    }
}
/**
 * Destroy every cluster referenced by an array of floating-point clusters.
 *
 * Only the clusters are destroyed (via cluster_destroy_fpt); the array
 * `clusters` itself is not freed here.
 *
 * @param clusters     Array of cluster pointers; NULL is accepted and ignored.
 * @param nb_clusters  Number of entries in `clusters`.
 */
void kmeans_destroy_clusters_fpt(cluster_fpt_t** clusters, const size_t nb_clusters) {
    if (clusters != NULL) {
        size_t idx = 0;
        while (idx < nb_clusters) {
            cluster_destroy_fpt(clusters[idx]);
            idx++;
        }
    }
}
/**
 * Run the k-means iteration on integer vectors until no center changes.
 *
 * Each round empties all clusters (cluster_reset_int), assigns every point to
 * the cluster whose center is closest according to `distance_function`, then
 * recomputes every center with cluster_update_center_int(). The loop stops
 * after the first round in which no center reports a change.
 *
 * @param points             Array of `point_count` points to classify.
 * @param point_count        Number of points.
 * @param clusters           Array of `nb_clusters` clusters with initial centers.
 * @param nb_clusters        Number of clusters.
 * @param distance_function  Distance between a point and a center.
 *
 * The function returns without doing anything when there is no cluster array,
 * no cluster, no distance function, or a NULL point array with a non-zero count.
 */
void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters,
                fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) {
    // clusters[0] is read unconditionally below: refuse an empty/missing cluster array
    if (NULL == clusters || 0 == nb_clusters || NULL == distance_function) return;
    if (NULL == points && point_count > 0) return;
    bool changed = true;
    while (changed) {
        // reset condition
        changed = false;
        // empty all clusters, keeping only their centers (virtual)
        for (size_t k = 0; k < nb_clusters; ++k) {
            cluster_reset_int(clusters[k]);
        }
        // for each point
        for (size_t i = 0; i < point_count; ++i) {
            vector_int_t* point = points[i];
            // find closest cluster; cluster 0 is the initial candidate,
            // so the scan starts at 1 to avoid measuring it twice
            cluster_int_t* cmin = clusters[0];
            fpt_t dmin = distance_function(point, cmin->center);
            for (size_t k = 1; k < nb_clusters; ++k) {
                cluster_int_t* current_cluster = clusters[k];
                fpt_t dist = distance_function(point, current_cluster->center);
                if (dist < dmin) {
                    cmin = current_cluster;
                    dmin = dist;
                }
            }
            // add point to closest cluster
            cluster_add_point_int(cmin, point);
        }
        // update all cluster centers
        for (size_t k = 0; k < nb_clusters; ++k) {
            assert(clusters[k] != NULL);
            assert(clusters[k]->points != NULL);
            if (cluster_update_center_int(clusters[k])) {
                changed = true;
            }
        }
    }
}
...@@ -94,7 +179,6 @@ void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** ...@@ -94,7 +179,6 @@ void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t**
assert(clusters[k]->points != NULL); assert(clusters[k]->points != NULL);
if (cluster_update_center_fpt(clusters[k])) { if (cluster_update_center_fpt(clusters[k])) {
changed = true; changed = true;
printf("%lud \n<%lf %lf %lf>\n\n", nb_clusters, clusters[k]->center->data[0], clusters[k]->center->data[1], clusters[k]->center->data[2]);
} }
} }
} }
......
...@@ -10,6 +10,11 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size ...@@ -10,6 +10,11 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size
cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters); cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters);
void kmeans_destroy_clusters_int(cluster_int_t** clusters, const size_t nb_clusters);
void kmeans_destroy_clusters_fpt(cluster_fpt_t** clusters, const size_t nb_clusters);
void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)); void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*));
void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)); void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*));
......
#include <assert.h>
#include <getopt.h> #include <getopt.h>
#include <stdbool.h> #include <stdbool.h>
#include <stdio.h> #include <stdio.h>
...@@ -70,45 +71,59 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance ...@@ -70,45 +71,59 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance
} }
/**
 * Integer variant of the program: read points, run k-means, write the result.
 *
 * @param ifile           Open input stream positioned after the header lines.
 * @param ofile           Open output stream.
 * @param dim             Dimension of the vectors.
 * @param nb_clusters     Number of clusters requested.
 * @param dist_func_type  Index into DIST_FUNC_INT selecting the distance.
 * @return EXIT_SUCCESS once the clusters were written; EXIT_FAILURE when the
 *         points could not be read or the clusters could not be initialised.
 *
 * The streams are not closed here.
 */
int main_int(FILE* ifile, FILE* ofile, const size_t dim, const size_t nb_clusters, const enum DistanceFunctionType dist_func_type) {
    // INIT
    int status = EXIT_FAILURE;
    vector_int_t** points = NULL;
    cluster_int_t** clusters = NULL;
    size_t point_count = 0;
    // READ
    list_points_int_t* list = io_get_vector_list_int(ifile, dim);
    if (NULL == list) return EXIT_FAILURE;
    point_count = list->size;
    points = list_points_to_array_int(list);
    // NOTE(review): `false` presumably keeps the points alive for the array — confirm
    list_points_destroy_int(list, false);
    list = NULL;
    if (NULL == points) return EXIT_FAILURE;
    // ALGORITHM
    printf("INIT: ");
    clusters = kmeans_init_clusters_int((const vector_int_t**) points, point_count, nb_clusters);
    if (NULL == clusters) {
        // init refuses fewer than 2 points or 2 clusters; running k-means on NULL would crash
        printf("FAILED\n");
        goto cleanup;
    }
    printf("DONE\n");
    printf("KMEANS: begin\n");
    kmeans_int(points, point_count, clusters, nb_clusters, DIST_FUNC_INT[dist_func_type]);
    printf("KMEANS: DONE !\n");
    // WRITE (%zu is the conversion matching size_t)
    fprintf(ofile, "%zu\n%zu\n", dim, nb_clusters);
    io_write_clusters_to_file_int(ofile, clusters, nb_clusters);
    status = EXIT_SUCCESS;
cleanup:
    // CLEANUP: release the elements, then the arrays that held them
    if (clusters != NULL) {
        for (size_t i = 0; i < nb_clusters; ++i) cluster_destroy_int(clusters[i]);
        free(clusters);
    }
    for (size_t i = 0; i < point_count; ++i) vector_destroy_int(points[i]);
    free(points);
    // EXIT
    return status;
}
int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) { int main_fpt(FILE* ifile, FILE* ofile, const size_t dim, const size_t nb_clusters, const enum DistanceFunctionType dist_func_type) {
// INIT
vector_fpt_t** points = NULL;
cluster_fpt_t** clusters = NULL;
// READ // READ
FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
const size_t dim = io_read_int(ifile);
const size_t nb_clusters = io_read_int(ifile);
if (0 == dim) {
printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
fclose(ifile);
return EXIT_FAILURE;
}
if (0 == nb_clusters) {
printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
fclose(ifile);
return EXIT_FAILURE;
}
list_points_fpt_t* list = io_get_vector_list_fpt(ifile, dim); list_points_fpt_t* list = io_get_vector_list_fpt(ifile, dim);
fclose(ifile);
ifile = NULL;
const size_t point_count = list->size; const size_t point_count = list->size;
vector_fpt_t** points = list_points_to_array_fpt(list); points = list_points_to_array_fpt(list);
list_points_destroy_fpt(list, false); list_points_destroy_fpt(list, false);
list = NULL; list = NULL;
// ALGORITHM // ALGORITHM
printf("INIT: ... "); printf("INIT: ");
cluster_fpt_t** clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters); clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters);
printf("DONE\n"); printf("DONE\n");
printf("STARTING KMEANS ALGORITHM: ...\n"); printf("KMEANS: begin\n");
kmeans_fpt(points, point_count, clusters, nb_clusters, DIST_FUNC_FPT[dist_func_type]); kmeans_fpt(points, point_count, clusters, nb_clusters, DIST_FUNC_FPT[dist_func_type]);
printf("KMEANS DONE !\n"); printf("KMEANS: DONE !\n");
// WRITE // WRITE
FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout; fprintf(ofile, "%lu\n%lu\n", dim, nb_clusters);
fprintf(ofile, "%lud\n%lud\n", dim, nb_clusters);
io_write_clusters_to_file_fpt(ofile, clusters, nb_clusters); io_write_clusters_to_file_fpt(ofile, clusters, nb_clusters);
fclose(ofile); // CLEANUP
for (size_t i = 0; i < nb_clusters; ++i) cluster_destroy_fpt(clusters[i]);
free(clusters);
for (size_t i = 0; i < point_count; ++i) vector_destroy_fpt(points[i]);
free(points);
// EXIT
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
...@@ -120,14 +135,34 @@ int main(int argc, char** argv) { ...@@ -120,14 +135,34 @@ int main(int argc, char** argv) {
enum DataType datatype = FLOAT; enum DataType datatype = FLOAT;
// parse args // parse args
parse_args(argc, argv, &ipath, &opath, &disttype, &datatype); parse_args(argc, argv, &ipath, &opath, &disttype, &datatype);
switch (datatype) { // open files
case FLOAT: FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
printf("FLOAT\n"); FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
return main_fpt(ipath, opath, disttype); // read dimension and desired number of clusters from file
case INT: const size_t dim = io_read_int(ifile);
printf("INT\n"); const size_t nb_clusters = io_read_int(ifile);
return main_int(ipath, opath, disttype); if (0 == dim) {
default: printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
abort(); fclose(ifile);
fclose(ofile);
return EXIT_FAILURE;
} }
if (0 == nb_clusters) {
printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
fclose(ifile);
fclose(ofile);
return EXIT_FAILURE;
}
// type specific code
int return_value = EXIT_FAILURE;
int (* main_routine)(FILE*, FILE*, const size_t, const size_t, const enum DistanceFunctionType);
main_routine = INT == datatype ? main_int : main_fpt;
printf(INT == datatype ? "TYPE: INT\n" : "TYPE: FLOAT\n");
assert(ifile != NULL);
assert(ofile != NULL);
return_value = main_routine(ifile, ofile, dim, nb_clusters, disttype);
// cleanup
fclose(ifile);
fclose(ofile);
return return_value;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment