Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • features
  • increment
  • main
3 results

Target

Select target project
  • boris.stefanov/prog_kmeans
1 result
Select Git revision
Show changes
Commits on Source (23)
.idea
*.o
build
*.pdf
doc/*.pdf
......@@ -7,14 +7,14 @@ HDR := $(wildcard ${SRC_DIR}/*.h)
BUILD_DIR := ${BUILD_ROOT}/prod
TARGET := ${BUILD_DIR}/main
CFLAGS := -std=c11 -Wall -Wextra -pedantic
LDEXTRA :=
LDEXTRA := -lm
LDFLAGS := ${CFLAGS} ${LDEXTRA}
OBJ := $(patsubst ${SRC_DIR}/%.c,${BUILD_DIR}/%.o,${SRC})
DEBUG_BUILD_DIR := ${BUILD_ROOT}/debug
DEBUG_TARGET := ${DEBUG_BUILD_DIR}/debug
DEBUG_CFLAGS := ${CFLAGS} -fsanitize=address -fsanitize=leak -g -DDEBUG
DEBUG_LDEXTRA :=
DEBUG_LDEXTRA := ${LDEXTRA}
DEBUG_LDFLAGS := ${DEBUG_CFLAGS} ${DEBUG_LDEXTRA}
DEBUG_OBJ := $(patsubst ${SRC_DIR}/%.c,${DEBUG_BUILD_DIR}/%.o,${SRC})
......@@ -22,18 +22,18 @@ DEBUG_OBJ := $(patsubst ${SRC_DIR}/%.c,${DEBUG_BUILD_DIR}/%.o,${SRC})
# TARGETS
all: ${TARGET} ${TARGET_DEBUG}
all: ${TARGET} ${DEBUG_TARGET}
${TARGET}: ${OBJ}
${CC} ${LDFLAGS} -o $@ $^
${OBJ}: ${BUILD_DIR}/%.o: %.c ${HDR} ${BUILD_DIR}
${OBJ}: ${BUILD_DIR}/%.o: ${SRC_DIR}/%.c ${HDR} ${BUILD_DIR}
${CC} ${CFLAGS} -c -o $@ $<
${DEBUG_TARGET}: ${DEBUG_OBJ}
${CC} ${DEBUG_LDFLAGS} -o $@ $^
${DEBUG_OBJ}: ${DEBUG_BUILD_DIR}/%.o: %.c ${HDR} ${DEBUG_BUILD_DIR}
${DEBUG_OBJ}: ${DEBUG_BUILD_DIR}/%.o: ${SRC_DIR}/%.c ${HDR} ${DEBUG_BUILD_DIR}
${CC} ${DEBUG_CFLAGS} -c -o $@ $<
......@@ -53,7 +53,10 @@ clean:
rm -rf ${BUILD_ROOT}
debug: ${DEBUG_TARGET}
./$<
./$< -i test/data.txt
test: ${TARGET}
./$< -i test/data.txt -o ~/test_kmeans
exec: ${TARGET}
./$<
......@@ -7,7 +7,7 @@ read: ${PDF}
firefox $^
%.pdf: %.md Makefile
pandoc --pdf-engine=xelatex -o $@ $<
pandoc --pdf-engine=lualatex -t beamer -o $@ $<
clean:
rm -rf ${PDF}
......
......@@ -2,21 +2,94 @@
title: K-Means - Une Implémentation
author: Boris Stefanovic
date: 2022-05-24
theme: "Frankfurt"
geometry: "margin=40mm"
mainfont: DejaVu Sans
header-includes:
- \usepackage{float}
- \let\origfigure\figure
- \let\endorigfigure\endfigure
- \renewenvironment{figure}[1][2] {\expandafter\origfigure\expandafter[H]} {\endorigfigure}
---
\newpage
# Structures de Données
## Vecteur
- chaque point est un vecteur
- types entiers et virgule flottante séparés
- "common.h" contient les définitions de `int_t` et `fpt_t`
We can justify the implementation of vectors for multiple types to ease
application to several scenarios without the need for casting,
e.g. scientific measurements (floating point) and image data (integer).
```c
typedef struct vector_int {
size_t dim;
int_t* data;
} vector_int_t;
typedef struct vector_fpt {
size_t dim;
fpt_t* data;
} vector_fpt_t;
```
## Faciliter l'Association à un Cluster
- structure vecteur vue précédemment générale
- on veut associer chaque point à un cluster
- le cluster auquel chaque point appartient change au cours de l'algorithme
- stocker les points "dans" des structures "cluster" est peu judicieux
- on stocke un identifiant de cluster (un pointeur) dans une structure "point de cluster"
## Cluster
- un cluster peut être représenté par
- un identifiant: un pointeur, forcément unique
- son centre: un point virtuel, la valeur derrière le pointeur
# Décisions
```c
typedef vector_int_t* cluster_int_t;
```
## Point de Cluster
```c
typedef struct point_int {
vector_int_t* vector;
vector_int_t* cluster;
} point_int_t;
```
On justifie la présence d'une adresse de centre dans la struct par
le fait qu'il y ait une relation qui à chaque centre associe probablement
plusieurs points.
## Ensemble de Points
- parcours répétés de l'ensemble de tous les points
- pas d'ordre particulier (sauf à l'initialisation des centroïdes)
- une liste simplement chaînée fera l'affaire
```c
typedef struct list_points_node_int {
const point_int_t* point;
struct list_points_node_int* next;
} list_points_node_int_t;
typedef struct list_points_int {
list_points_node_int_t* head;
list_points_node_int_t* tail;
size_t size;
} list_points_int_t;
```
# Algorithmique
# Implémentation
# Démonstration
File added
#include "cluster.h"
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include "vector.h"
// Tolerance applied to per-coordinate differences when comparing an old and a
// new floating-point cluster center.
#define EPSILON 0.001
// Allocate an integer cluster whose center is `center` and whose point list is
// whatever list_points_create_int() returns.
// Returns NULL if the cluster structure itself cannot be allocated.
cluster_int_t* cluster_create_int(vector_int_t* center) {
    cluster_int_t* created = malloc(sizeof(cluster_int_t));
    if (created != NULL) {
        created->center = center;
        created->points = list_points_create_int();
    }
    return created;
}
// Allocate a floating-point cluster whose center is `center` and whose point
// list is whatever list_points_create_fpt() returns.
// Returns NULL if the cluster structure itself cannot be allocated.
cluster_fpt_t* cluster_create_fpt(vector_fpt_t* center) {
    cluster_fpt_t* created = malloc(sizeof(cluster_fpt_t));
    if (created != NULL) {
        created->center = center;
        created->points = list_points_create_fpt();
    }
    return created;
}
// Release an integer cluster: its center vector, its list nodes (the listed
// point vectors are NOT destroyed, `full` is false) and the cluster itself.
// A NULL cluster is ignored.
void cluster_destroy_int(cluster_int_t* cluster) {
    if (cluster != NULL) {
        vector_destroy_int(cluster->center);
        list_points_destroy_int(cluster->points, false);
        free(cluster);
    }
}
// Release a floating-point cluster: its center vector, its list nodes (the
// listed point vectors are NOT destroyed, `full` is false) and the cluster
// itself. A NULL cluster is ignored.
void cluster_destroy_fpt(cluster_fpt_t* cluster) {
    if (cluster != NULL) {
        vector_destroy_fpt(cluster->center);
        list_points_destroy_fpt(cluster->points, false);
        free(cluster);
    }
}
// Append `point` to the cluster's point list; does nothing when either
// argument is NULL.
void cluster_add_point_int(cluster_int_t* cluster, vector_int_t* point) {
    if (cluster != NULL && point != NULL) {
        list_points_append_int(cluster->points, point);
    }
}
// Append `point` to the cluster's point list; does nothing when either
// argument is NULL.
void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) {
    if (cluster != NULL && point != NULL) {
        list_points_append_fpt(cluster->points, point);
    }
}
// Recompute the center of an integer cluster as the (integer) mean of the
// points currently in its list.
//
// Returns true when the new center differs from the previous one in at least
// one coordinate, false otherwise. An empty cluster keeps its center and
// reports false. On allocation failure the old center is kept and false is
// returned.
bool cluster_update_center_int(cluster_int_t* cluster) {
    // validate before the first dereference
    assert(cluster != NULL);
    assert(cluster->points != NULL);
    vector_int_t* old_center = cluster->center;
    assert(old_center != NULL);

    list_points_node_int_t* node = cluster->points->head;
    if (NULL == node) {
        return false; // empty cluster: center has not been changed
    }

    vector_int_t* new_center = vector_create_int(node->point->dim);
    if (NULL == new_center) {
        return false; // could not allocate: keep the previous center
    }

    // sum all points into the new center, then divide by their count
    for (; node != NULL; node = node->next) {
        vector_add_inplace_int(new_center, *(node->point));
    }
    vector_div_inplace_int(new_center, (int_t) cluster->points->size);

    // the center has changed as soon as ONE coordinate differs
    bool changed = new_center->dim != old_center->dim;
    for (size_t p = 0; !changed && p < new_center->dim; ++p) {
        changed = new_center->data[p] != old_center->data[p];
    }

    cluster->center = new_center;
    vector_destroy_int(old_center);
    return changed;
}
// Recompute the center of a floating-point cluster as the mean of the points
// currently in its list.
//
// Returns true when at least one coordinate of the new center differs from
// the previous one by more than EPSILON, false otherwise. An empty cluster
// keeps its center and reports false. On allocation failure the old center is
// kept and false is returned.
bool cluster_update_center_fpt(cluster_fpt_t* cluster) {
    // validate before the first dereference
    assert(cluster != NULL);
    assert(cluster->points != NULL);
    vector_fpt_t* old_center = cluster->center;
    assert(old_center != NULL);

    list_points_node_fpt_t* node = cluster->points->head;
    if (NULL == node) {
        return false; // empty cluster: center has not been changed
    }

    vector_fpt_t* new_center = vector_create_fpt(node->point->dim);
    if (NULL == new_center) {
        return false; // could not allocate: keep the previous center
    }

    // sum all points into the new center, then divide by their count
    for (; node != NULL; node = node->next) {
        vector_add_inplace_fpt(new_center, *(node->point));
    }
    vector_div_inplace_fpt(new_center, (fpt_t) cluster->points->size);

    // the center has changed as soon as ONE coordinate moved by more than EPSILON
    bool changed = new_center->dim != old_center->dim;
    for (size_t p = 0; !changed && p < new_center->dim; ++p) {
        changed = fabs(new_center->data[p] - old_center->data[p]) > EPSILON;
    }

    cluster->center = new_center;
    vector_destroy_fpt(old_center);
    return changed;
}
// Empty the cluster: drop its current list (nodes only, the point vectors are
// kept) and install a newly created list. The center is left untouched.
void cluster_reset_int(cluster_int_t* cluster) {
    list_points_int_t* stale = cluster->points;
    list_points_destroy_int(stale, false);
    cluster->points = list_points_create_int();
}
// Empty the cluster: drop its current list (nodes only, the point vectors are
// kept) and install a newly created list. The center is left untouched.
void cluster_reset_fpt(cluster_fpt_t* cluster) {
    list_points_fpt_t* stale = cluster->points;
    list_points_destroy_fpt(stale, false);
    cluster->points = list_points_create_fpt();
}
#ifndef PROG_KMEANS_CLUSTER_H
#define PROG_KMEANS_CLUSTER_H
#include <stdbool.h>
#include <stdlib.h>
#include "linkedlist.h"
#include "vector.h"
// Integer cluster: a center vector plus the list of points added to it.
typedef struct cluster_int {
// center of the cluster; destroyed together with the cluster
vector_int_t* center;
// points appended through cluster_add_point_int()
list_points_int_t* points;
} cluster_int_t;
// Floating-point cluster: a center vector plus the list of points added to it.
typedef struct cluster_fpt {
// center of the cluster; destroyed together with the cluster
vector_fpt_t* center;
// points appended through cluster_add_point_fpt()
list_points_fpt_t* points;
} cluster_fpt_t;
// Create a cluster around `center`; returns NULL if allocation fails.
cluster_int_t* cluster_create_int(vector_int_t* center);
cluster_fpt_t* cluster_create_fpt(vector_fpt_t* center);
// Destroy a cluster, its center and its list nodes (not the listed points).
void cluster_destroy_int(cluster_int_t* cluster);
void cluster_destroy_fpt(cluster_fpt_t* cluster);
// Append `point` to the cluster's point list (no-op on NULL arguments).
void cluster_add_point_int(cluster_int_t* cluster, vector_int_t* point);
void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point);
// Recompute the center from the listed points; the return value reports
// whether the center changed.
bool cluster_update_center_int(cluster_int_t* cluster);
bool cluster_update_center_fpt(cluster_fpt_t* cluster);
// Replace the point list with a new empty one, keeping the center.
void cluster_reset_int(cluster_int_t* cluster);
void cluster_reset_fpt(cluster_fpt_t* cluster);
#endif //PROG_KMEANS_CLUSTER_H
#include "common.h"
#include <stdbool.h>
#include <stdlib.h>
#include <time.h>
// Tracks whether the pseudo-random generator has been seeded yet.
bool randinit = false;

// Seed rand() from the current time and remember that seeding happened.
void init_rand() {
    srand(time(NULL));
    randinit = true;
}

// Return a pseudo-random value computed as rand() % max, seeding the
// generator on first use. A `max` of 0 would be a division by zero, so it
// yields 0 instead (the only value of an empty/degenerate range).
int rand_int(int max) {
    if (!randinit) init_rand();
    if (0 == max) return 0;
    return rand() % max;
}
// Return the smaller bound plus rand_int(larger - smaller); the two bounds
// may be given in either order.
int rand_int_range(int min, int max) {
    const int low = min > max ? max : min;
    const int high = min > max ? min : max;
    return low + rand_int(high - low);
}
// Return rand() / RAND_MAX as a double, i.e. a value between 0 and 1
// inclusive, seeding the generator on first use.
double rand_double_range_one() {
    if (!randinit) {
        init_rand();
    }
    const double numerator = (double) rand();
    const double denominator = (double) RAND_MAX;
    return numerator / denominator;
}
// Return a pseudo-random double between the two bounds, which may be given
// in either order.
double rand_double_range(double min, double max) {
    double low = min;
    double high = max;
    if (low > high) {
        low = max;
        high = min;
    }
    return low + rand_double_range_one() * (high - low);
}
#ifndef PROG_KMEANS_COMMON_H
#define PROG_KMEANS_COMMON_H
#include <stdint.h>
// Element type of the integer vectors.
typedef int64_t int_t;
// Element type of the floating-point vectors and of all distances.
typedef double fpt_t;
// Seed the pseudo-random generator from the current time.
void init_rand();
// rand() % max, seeding the generator on first use.
int rand_int(int max);
// Smaller bound + rand_int(larger bound - smaller bound); bounds in any order.
int rand_int_range(int min, int max);
// rand() / RAND_MAX as a double.
double rand_double_range_one();
// Pseudo-random double between the two bounds; bounds in any order.
double rand_double_range(double min, double max);
#endif //PROG_KMEANS_COMMON_H
#include "distance.h"
#include <math.h>
#include "common.h"
// Value returned by the distance functions when the two vectors do not have
// the same dimension; also the starting maximum of the Chebyshev distances.
#define ERROR -1.0
// Absolute value of (a2 - a1) for integers.
int_t abs_diff_int(const int_t a1, const int_t a2) {
    const int_t delta = a2 - a1;
    if (delta >= 0) {
        return delta;
    }
    return -delta;
}
// Absolute value of (a2 - a1) for floating-point values.
fpt_t abs_diff_fpt(const fpt_t a1, const fpt_t a2) {
    const fpt_t delta = a2 - a1;
    if (delta >= 0.0) {
        return delta;
    }
    return -delta;
}
fpt_t distance_euclid_int(const vector_int_t* p1, const vector_int_t* p2) {
if (p1->dim != p2->dim) return ERROR;
int_t acc = 0;
for (size_t i = 0; i < p1->dim; ++i) {
int_t diff = p2->data[i] - p1->data[i];
int_t item = diff * diff;
acc += item;
}
return sqrt((fpt_t) acc);
}
fpt_t distance_euclid_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) {
if (p1->dim != p2->dim) return ERROR;
fpt_t acc = 0;
for (size_t i = 0; i < p1->dim; ++i) {
fpt_t diff = p2->data[i] - p1->data[i];
fpt_t item = diff * diff;
acc += item;
}
return sqrt((fpt_t) acc);
}
fpt_t distance_manhattan_int(const vector_int_t* p1, const vector_int_t* p2) {
if (p1->dim != p2->dim) return ERROR;
int_t acc = 0;
for (size_t i = 0; i < p1->dim; ++i) {
int_t diff = p2->data[i] - p1->data[i];
int_t item = diff >= 0 ? diff : -diff;
acc += item;
}
return (fpt_t) acc;
}
fpt_t distance_manhattan_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) {
if (p1->dim != p2->dim) return ERROR;
fpt_t acc = 0;
for (size_t i = 0; i < p1->dim; ++i) {
fpt_t diff = p2->data[i] - p1->data[i];
fpt_t item = diff >= 0 ? diff : -diff;
acc += item;
}
return (fpt_t) acc;
}
fpt_t distance_chebyshev_int(const vector_int_t* p1, const vector_int_t* p2) {
if (p1->dim != p2->dim) return ERROR;
int_t max = ERROR;
int_t item;
for (size_t i = 0; i < p1->dim; ++i) {
item = abs_diff_int(p1->data[i], p2->data[i]);
if (item > max) max = item;
}
return (fpt_t) max;
}
fpt_t distance_chebyshev_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) {
if (p1->dim != p2->dim) return ERROR;
fpt_t max = ERROR;
fpt_t item;
for (size_t i = 0; i < p1->dim; ++i) {
item = abs_diff_fpt(p1->data[i], p2->data[i]);
if (item > max) max = item;
}
return (fpt_t) max;
}
#ifndef PROG_KMEANS_DISTANCE_H
#define PROG_KMEANS_DISTANCE_H
#include "vector.h"
// Absolute value of (a2 - a1).
int_t abs_diff_int(int_t a1, int_t a2);
fpt_t abs_diff_fpt(fpt_t a1, fpt_t a2);
// Euclidean distance; -1.0 when the two dimensions differ.
fpt_t distance_euclid_int(const vector_int_t* p1, const vector_int_t* p2);
fpt_t distance_euclid_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2);
// Manhattan distance (sum of absolute differences); -1.0 when the two dimensions differ.
fpt_t distance_manhattan_int(const vector_int_t* p1, const vector_int_t* p2);
fpt_t distance_manhattan_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2);
// Chebyshev distance (largest absolute difference); -1.0 when the two dimensions differ.
fpt_t distance_chebyshev_int(const vector_int_t* p1, const vector_int_t* p2);
fpt_t distance_chebyshev_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2);
#endif //PROG_KMEANS_DISTANCE_H
#define _GNU_SOURCE
#include "io.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "linkedlist.h"
#include "vector.h"
// Read one line from `file` and parse it as a base-10 integer.
// Returns 0 when `file` is NULL, when no line can be read (EOF or error) or
// when the line does not start with a number.
int_t io_read_int(FILE* file) {
    char* line = NULL;
    size_t len = 0;
    long res = 0;
    // only parse when getline actually delivered a line
    if (file != NULL && getline(&line, &len, file) != -1) {
        res = strtol(line, NULL, 10);
    }
    free(line); // getline may have allocated the buffer even on failure
    return res;
}
// Read one line from `file` and parse it as a floating-point number.
// Returns 0 when `file` is NULL, when no line can be read (EOF or error) or
// when the line does not start with a number.
fpt_t io_read_fpt(FILE* file) {
    char* line = NULL;
    size_t len = 0;
    double res = 0.0;
    // only parse when getline actually delivered a line
    if (file != NULL && getline(&line, &len, file) != -1) {
        res = strtod(line, NULL);
    }
    free(line); // getline may have allocated the buffer even on failure
    return res;
}
// Parse a comma-separated `line` into a newly created integer vector of
// dimension `dim`. `line` is modified by strtok. Missing or unparsable
// fields become 0; a NULL line yields an all-zero vector.
// Returns NULL when the vector cannot be allocated.
vector_int_t* io_line_to_vector_int(char* line, const size_t dim) {
    vector_int_t* vector = vector_create_int(dim);
    if (NULL == vector) return NULL;
    if (NULL == vector->data && dim > 0) {
        // storage allocation failed: nothing to fill
        vector_destroy_int(vector);
        return NULL;
    }
    if (NULL == line) return vector;
    char* tgt = line;
    char* token = NULL;
    // strtok takes the line on the first call only, NULL afterwards
    for (size_t i = 0; i < dim; ++i, tgt = NULL) {
        token = strtok(tgt, ",");
        // strtol returns 0 if number not read, which is the desired behaviour:
        vector->data[i] = token != NULL ? strtol(token, NULL, 10) : 0;
    }
    return vector;
}
// Parse a comma-separated `line` into a newly created floating-point vector
// of dimension `dim`. `line` is modified by strtok. Missing or unparsable
// fields become 0; a NULL line yields an all-zero vector.
// Returns NULL when the vector cannot be allocated.
vector_fpt_t* io_line_to_vector_fpt(char* line, const size_t dim) {
    vector_fpt_t* vector = vector_create_fpt(dim);
    if (NULL == vector) return NULL;
    if (NULL == vector->data && dim > 0) {
        // storage allocation failed: nothing to fill
        vector_destroy_fpt(vector);
        return NULL;
    }
    if (NULL == line) return vector;
    char* tgt = line;
    char* token = NULL;
    // strtok takes the line on the first call only, NULL afterwards
    for (size_t i = 0; i < dim; ++i, tgt = NULL) {
        token = strtok(tgt, ",");
        // strtod returns 0 if number not read, which is the desired behaviour:
        vector->data[i] = token != NULL ? strtod(token, NULL) : 0;
    }
    return vector;
}
// Read every remaining line of `ifile` and turn each non-blank line into an
// integer vector of dimension `dim`, appended to a newly created list.
// Lines containing only whitespace are skipped.
// Returns NULL when the list cannot be allocated.
list_points_int_t* io_get_vector_list_int(FILE* ifile, const size_t dim) {
    list_points_int_t* list = list_points_create_int();
    if (NULL == list) return NULL;
    char* line = NULL;
    size_t capacity = 0; // buffer size managed by getline, NOT the line length
    while (getline(&line, &capacity, ifile) != -1) {
        // skip lines made of whitespace only (e.g. a trailing empty line)
        if ('\0' == line[strspn(line, " \t\r\n")]) continue;
        vector_int_t* vector = io_line_to_vector_int(line, dim);
        list_points_append_int(list, vector);
    }
    free(line);
    return list;
}
// Read every remaining line of `ifile` and turn each non-blank line into a
// floating-point vector of dimension `dim`, appended to a newly created list.
// Lines containing only whitespace are skipped.
// Returns NULL when the list cannot be allocated.
list_points_fpt_t* io_get_vector_list_fpt(FILE* ifile, const size_t dim) {
    list_points_fpt_t* list = list_points_create_fpt();
    if (NULL == list) return NULL;
    char* line = NULL;
    size_t capacity = 0; // buffer size managed by getline, NOT the line length
    while (getline(&line, &capacity, ifile) != -1) {
        // skip lines made of whitespace only (e.g. a trailing empty line)
        if ('\0' == line[strspn(line, " \t\r\n")]) continue;
        vector_fpt_t* vector = io_line_to_vector_fpt(line, dim);
        list_points_append_fpt(list, vector);
    }
    free(line);
    return list;
}
// Write `cluster_count` integer clusters to `file`. Each cluster starts with
// the separator "\n*\n", followed by one line per point whose coordinates
// are joined by " , ". A point of dimension 0 produces an empty line.
void io_write_clusters_to_file_int(FILE* file, cluster_int_t** clusters, const size_t cluster_count) {
    for (size_t i = 0; i < cluster_count; ++i) {
        fprintf(file, "\n*\n");
        for (list_points_node_int_t* node = clusters[i]->points->head; node != NULL; node = node->next) {
            const vector_int_t* point = node->point;
            for (size_t p = 0; p < point->dim; ++p) {
                if (p > 0) fprintf(file, " , ");
                // int_t is int64_t: widen explicitly so the specifier is portable
                fprintf(file, "%lld", (long long) point->data[p]);
            }
            fprintf(file, "\n");
        }
    }
}
// Write `cluster_count` floating-point clusters to `file`. Each cluster
// starts with the separator "\n*\n", followed by one line per point whose
// coordinates are joined by " , ". A point of dimension 0 produces an empty
// line.
void io_write_clusters_to_file_fpt(FILE* file, cluster_fpt_t** clusters, const size_t cluster_count) {
    for (size_t i = 0; i < cluster_count; ++i) {
        fprintf(file, "\n*\n");
        for (list_points_node_fpt_t* node = clusters[i]->points->head; node != NULL; node = node->next) {
            const vector_fpt_t* point = node->point;
            // looping from 0 avoids reading data[0] of a zero-dimension point
            for (size_t p = 0; p < point->dim; ++p) {
                if (p > 0) fprintf(file, " , ");
                fprintf(file, "%lf", point->data[p]);
            }
            fprintf(file, "\n");
        }
    }
}
#ifndef PROG_KMEANS_IO_H
#define PROG_KMEANS_IO_H
#include <stdio.h>
#include "cluster.h"
#include "common.h"
#include "linkedlist.h"
#include "vector.h"
// Read one line of `file` and parse it as a number.
int_t io_read_int(FILE* file);
fpt_t io_read_fpt(FILE* file);
// Parse a comma-separated line into a new vector of dimension `dim`;
// the line is modified (tokenised with strtok).
vector_int_t* io_line_to_vector_int(char* line, const size_t dim);
vector_fpt_t* io_line_to_vector_fpt(char* line, const size_t dim);
// Read the remaining lines of `ifile` into a new list of vectors.
list_points_int_t* io_get_vector_list_int(FILE* ifile, const size_t dim);
list_points_fpt_t* io_get_vector_list_fpt(FILE* ifile, const size_t dim);
// Write each cluster as a "*" separator line followed by one line per point.
void io_write_clusters_to_file_int(FILE* file, cluster_int_t** clusters, const size_t cluster_count);
void io_write_clusters_to_file_fpt(FILE* file, cluster_fpt_t** clusters, const size_t cluster_count);
#endif //PROG_KMEANS_IO_H
#include "kmeans.h"
#include <assert.h>
#include "vector.h"
// Tell whether `center` equals the center of one of the first `i` clusters.
bool is_vector_in_centers_int(const vector_int_t* center, const cluster_int_t** clusters, const size_t i) {
    bool found = false;
    size_t k = 0;
    while (!found && k < i) {
        found = vector_equals_int(clusters[k]->center, center);
        ++k;
    }
    return found;
}
cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) {
// check args and init
if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
cluster_int_t** clusters = calloc(nclusters, sizeof(vector_int_t*));
if (NULL == clusters) return NULL;
for (size_t k = 0; k < nclusters; ++k) {
clusters[k] = cluster_create_int(NULL);
}
// determine range in which we are working
vector_int_t* min = vector_copy_int(points[0]);
vector_int_t* max = vector_copy_int(points[0]);
for (size_t i = 1; i < point_count; ++i) {
for (size_t p = 0; p < max->dim; ++p) {
const int_t value = points[i]->data[p];
if (value < min->data[p]) min->data[p] = value;
if (value > max->data[p]) max->data[p] = value;
}
}
// until we have enough centers
for (size_t i = 0; i < nclusters; ++i) {
vector_int_t* center = vector_create_int(max->dim);
bool valid = false;
while (!valid) {
// initialise center values randomly, within the "multidimensional rectangle" of our set of points
for (size_t p = 0; p < center->dim; ++p) {
center->data[p] = rand_int_range(min->data[p], max->data[p]); // TODO: create a rand_long_range(...) function
}
// check center is not already in clusters, although probability is extremely low...
valid = !is_vector_in_centers_int(center, (const cluster_int_t**) clusters, i);
}
clusters[i] = cluster_create_int(center);
}
return clusters;
}
// Tell whether `center` equals the center of one of the first `i` clusters.
bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t** clusters, const size_t i) {
    bool found = false;
    size_t k = 0;
    while (!found && k < i) {
        found = vector_equals_fpt(clusters[k]->center, center);
        ++k;
    }
    return found;
}
cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) {
// check args and init
if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
cluster_fpt_t** clusters = calloc(nclusters, sizeof(vector_fpt_t*));
if (NULL == clusters) return NULL;
for (size_t k = 0; k < nclusters; ++k) {
clusters[k] = cluster_create_fpt(NULL);
}
// determine range in which we are working
vector_fpt_t* min = vector_copy_fpt(points[0]);
vector_fpt_t* max = vector_copy_fpt(points[0]);
for (size_t i = 1; i < point_count; ++i) {
for (size_t p = 0; p < max->dim; ++p) {
const fpt_t value = points[i]->data[p];
if (value < min->data[p]) min->data[p] = value;
if (value > max->data[p]) max->data[p] = value;
}
}
// until we have enough centers
for (size_t i = 0; i < nclusters; ++i) {
vector_fpt_t* center = vector_create_fpt(max->dim);
bool valid = false;
while (!valid) {
// initialise center values randomly, within the "multidimensional rectangle" of our set of points
for (size_t p = 0; p < center->dim; ++p) {
center->data[p] = rand_double_range(min->data[p], max->data[p]);
}
// check center is not already in clusters, although probability is extremely low...
valid = !is_vector_in_centers_fpt(center, (const cluster_fpt_t**) clusters, i);
}
clusters[i] = cluster_create_fpt(center);
}
return clusters;
}
// Destroy the first `nb_clusters` clusters of the array. The array itself is
// not freed here. A NULL array is ignored.
void kmeans_destroy_clusters_int(cluster_int_t** clusters, const size_t nb_clusters) {
    if (clusters != NULL) {
        size_t i = 0;
        while (i < nb_clusters) {
            cluster_destroy_int(clusters[i]);
            ++i;
        }
    }
}
// Destroy the first `nb_clusters` clusters of the array. The array itself is
// not freed here. A NULL array is ignored.
void kmeans_destroy_clusters_fpt(cluster_fpt_t** clusters, const size_t nb_clusters) {
    if (clusters != NULL) {
        size_t i = 0;
        while (i < nb_clusters) {
            cluster_destroy_fpt(clusters[i]);
            ++i;
        }
    }
}
// Run the k-means iteration on integer points: empty every cluster, attach
// each point to the cluster whose center is nearest according to
// `distance_function`, recompute all centers, and repeat for as long as
// cluster_update_center_int() reports a change for at least one cluster.
void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters,
                fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) {
    bool changed;
    do {
        changed = false;
        // start the pass with empty clusters (centers are kept)
        for (size_t k = 0; k < nb_clusters; ++k) {
            cluster_reset_int(clusters[k]);
        }
        // assignment step
        for (size_t i = 0; i < point_count; ++i) {
            vector_int_t* point = points[i];
            cluster_int_t* closest = clusters[0];
            fpt_t best = distance_function(point, closest->center);
            for (size_t k = 0; k < nb_clusters; ++k) {
                cluster_int_t* candidate = clusters[k];
                const fpt_t dist = distance_function(point, candidate->center);
                if (dist < best) {
                    closest = candidate;
                    best = dist;
                }
            }
            cluster_add_point_int(closest, point);
        }
        // update step: every cluster is updated, even once a change is known
        for (size_t k = 0; k < nb_clusters; ++k) {
            assert(clusters[k] != NULL);
            assert(clusters[k]->points != NULL);
            changed = cluster_update_center_int(clusters[k]) || changed;
        }
    } while (changed);
}
// Run the k-means iteration on floating-point points: empty every cluster,
// attach each point to the cluster whose center is nearest according to
// `distance_function`, recompute all centers, and repeat for as long as
// cluster_update_center_fpt() reports a change for at least one cluster.
void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters,
                fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) {
    bool changed;
    do {
        changed = false;
        // start the pass with empty clusters (centers are kept)
        for (size_t k = 0; k < nb_clusters; ++k) {
            cluster_reset_fpt(clusters[k]);
        }
        // assignment step
        for (size_t i = 0; i < point_count; ++i) {
            vector_fpt_t* point = points[i];
            cluster_fpt_t* closest = clusters[0];
            fpt_t best = distance_function(point, closest->center);
            for (size_t k = 0; k < nb_clusters; ++k) {
                cluster_fpt_t* candidate = clusters[k];
                const fpt_t dist = distance_function(point, candidate->center);
                if (dist < best) {
                    closest = candidate;
                    best = dist;
                }
            }
            cluster_add_point_fpt(closest, point);
        }
        // update step: every cluster is updated, even once a change is known
        for (size_t k = 0; k < nb_clusters; ++k) {
            assert(clusters[k] != NULL);
            assert(clusters[k]->points != NULL);
            changed = cluster_update_center_fpt(clusters[k]) || changed;
        }
    } while (changed);
}
#ifndef PROG_KMEANS_KMEANS_H
#define PROG_KMEANS_KMEANS_H
#include "cluster.h"
#include "linkedlist.h"
// Create `nclusters` clusters with random, pairwise distinct centers inside
// the bounding box of `points`; NULL on invalid arguments.
cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters);
cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters);
// Destroy every cluster of the array (the array itself is not freed).
void kmeans_destroy_clusters_int(cluster_int_t** clusters, const size_t nb_clusters);
void kmeans_destroy_clusters_fpt(cluster_fpt_t** clusters, const size_t nb_clusters);
// Run k-means on `points` with the given clusters and distance function.
void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*));
void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*));
#endif //PROG_KMEANS_KMEANS_H
#include "linkedlist.h"
#include <assert.h>
#include <stdbool.h>
// Allocate a list node referring to `point`, with no successor.
// Returns NULL when `point` is NULL or the allocation fails.
list_points_node_int_t* list_points_node_create_int(vector_int_t* point) {
    list_points_node_int_t* node = NULL;
    if (point != NULL) {
        node = malloc(sizeof(list_points_node_int_t));
        if (node != NULL) {
            node->point = point;
            node->next = NULL;
        }
    }
    return node;
}
// Allocate a list node referring to `point`, with no successor.
// Returns NULL when `point` is NULL or the allocation fails.
list_points_node_fpt_t* list_points_node_create_fpt(vector_fpt_t* point) {
    list_points_node_fpt_t* node = NULL;
    if (point != NULL) {
        node = malloc(sizeof(list_points_node_fpt_t));
        if (node != NULL) {
            node->point = point;
            node->next = NULL;
        }
    }
    return node;
}
// Free a list node; when `full` is true the vector it refers to is destroyed
// as well. A NULL node is ignored.
void list_points_node_destroy_int(list_points_node_int_t* node, const bool full) {
    if (node != NULL) {
        if (full) {
            vector_destroy_int(node->point);
        }
        free(node);
    }
}
// Free a list node; when `full` is true the vector it refers to is destroyed
// as well. A NULL node is ignored.
void list_points_node_destroy_fpt(list_points_node_fpt_t* node, const bool full) {
    if (node != NULL) {
        if (full) {
            vector_destroy_fpt(node->point);
        }
        free(node);
    }
}
// Allocate an empty list (no head, no tail, size 0).
// Returns NULL when the allocation fails.
list_points_int_t* list_points_create_int() {
    list_points_int_t* list = malloc(sizeof(list_points_int_t));
    if (list != NULL) {
        list->size = 0;
        list->head = NULL;
        list->tail = NULL;
    }
    return list;
}
// Allocate an empty list (no head, no tail, size 0).
// Returns NULL when the allocation fails.
list_points_fpt_t* list_points_create_fpt() {
    list_points_fpt_t* list = malloc(sizeof(list_points_fpt_t));
    if (list != NULL) {
        list->size = 0;
        list->head = NULL;
        list->tail = NULL;
    }
    return list;
}
// Free every node of the list, then the list itself. When `full` is true the
// vectors referenced by the nodes are destroyed too. A NULL list is ignored.
void list_points_destroy_int(list_points_int_t* list, const bool full) {
    if (NULL == list) return;
    list_points_node_int_t* current = list->head;
    while (current != NULL) {
        list_points_node_int_t* following = current->next;
        list->head = following;
        list_points_node_destroy_int(current, full);
        current = following;
    }
    free(list);
}
// Free every node of the list, then the list itself. When `full` is true the
// vectors referenced by the nodes are destroyed too. A NULL list is ignored.
void list_points_destroy_fpt(list_points_fpt_t* list, const bool full) {
    if (NULL == list) return;
    list_points_node_fpt_t* current = list->head;
    while (current != NULL) {
        list_points_node_fpt_t* following = current->next;
        list->head = following;
        list_points_node_destroy_fpt(current, full);
        current = following;
    }
    free(list);
}
// Append `point` at the tail of `list` and increase its size.
// Nothing happens when `list` or `point` is NULL, or when the node cannot be
// allocated (the list is then left exactly as it was).
void list_points_append_int(list_points_int_t* list, vector_int_t* point) {
    if (NULL == list || NULL == point) return;
    list_points_node_int_t* node = list_points_node_create_int(point);
    if (NULL == node) return; // allocation failed: keep head/tail/size consistent
    if (NULL == list->head) { // if list is empty
        list->head = node;
    } else {
        list->tail->next = node;
    }
    list->tail = node;
    ++list->size;
}
// Append `point` at the tail of `list` and increase its size.
// Nothing happens when `list` or `point` is NULL, or when the node cannot be
// allocated (the list is then left exactly as it was).
void list_points_append_fpt(list_points_fpt_t* list, vector_fpt_t* point) {
    if (NULL == list || NULL == point) return;
    list_points_node_fpt_t* node = list_points_node_create_fpt(point);
    if (NULL == node) return; // allocation failed: keep head/tail/size consistent
    if (NULL == list->head) { // if list is empty
        list->head = node;
    } else {
        list->tail->next = node;
    }
    list->tail = node;
    ++list->size;
}
// Build a calloc'ed array holding the point pointers of `list`, in list
// order. The vectors are shared with the list, not copied.
// Returns NULL when `list` is NULL or the allocation fails.
vector_int_t** list_points_to_array_int(const list_points_int_t* list) {
    if (NULL == list) return NULL;
    vector_int_t** array = calloc(list->size, sizeof(vector_int_t*));
    if (NULL == array) return NULL;
    size_t count = 0;
    for (list_points_node_int_t* it = list->head; it != NULL; it = it->next) {
        array[count] = it->point;
        ++count;
    }
    assert(count == list->size);
    return array;
}
// Build a calloc'ed array holding the point pointers of `list`, in list
// order. The vectors are shared with the list, not copied.
// Returns NULL when `list` is NULL or the allocation fails.
vector_fpt_t** list_points_to_array_fpt(const list_points_fpt_t* list) {
    if (NULL == list) return NULL;
    vector_fpt_t** array = calloc(list->size, sizeof(vector_fpt_t*));
    if (NULL == array) return NULL;
    size_t count = 0;
    for (list_points_node_fpt_t* it = list->head; it != NULL; it = it->next) {
        array[count] = it->point;
        ++count;
    }
    assert(count == list->size);
    return array;
}
#ifndef PROG_KMEANS_LINKEDLIST_H
#define PROG_KMEANS_LINKEDLIST_H
#include <stdbool.h>
#include "vector.h"
// Node of a singly linked list of integer vectors.
typedef struct list_points_node_int {
// vector referenced by this node
vector_int_t* point;
// following node, NULL for the last node
struct list_points_node_int* next;
} list_points_node_int_t;
// Node of a singly linked list of floating-point vectors.
typedef struct list_points_node_fpt {
// vector referenced by this node
vector_fpt_t* point;
// following node, NULL for the last node
struct list_points_node_fpt* next;
} list_points_node_fpt_t;
// Create a node referring to `point`; NULL if `point` is NULL or allocation fails.
list_points_node_int_t* list_points_node_create_int(vector_int_t* point);
list_points_node_fpt_t* list_points_node_create_fpt(vector_fpt_t* point);
// Free a node; `full` also destroys the vector it refers to.
void list_points_node_destroy_int(list_points_node_int_t* node, const bool full);
void list_points_node_destroy_fpt(list_points_node_fpt_t* node, const bool full);
// Singly linked list of integer vectors with direct access to both ends.
typedef struct list_points_int {
// first node, NULL when the list is empty
list_points_node_int_t* head;
// last node, where new points are appended
list_points_node_int_t* tail;
// number of appended points
size_t size;
} list_points_int_t;
// Singly linked list of floating-point vectors with direct access to both ends.
typedef struct list_points_fpt {
// first node, NULL when the list is empty
list_points_node_fpt_t* head;
// last node, where new points are appended
list_points_node_fpt_t* tail;
// number of appended points
size_t size;
} list_points_fpt_t;
// Create an empty list; NULL if allocation fails.
list_points_int_t* list_points_create_int();
list_points_fpt_t* list_points_create_fpt();
// Free the nodes and the list; `full` also destroys the referenced vectors.
void list_points_destroy_int(list_points_int_t* list, const bool full);
void list_points_destroy_fpt(list_points_fpt_t* list, const bool full);
// Append a vector at the tail of the list.
void list_points_append_int(list_points_int_t* list, vector_int_t* vector);
void list_points_append_fpt(list_points_fpt_t* list, vector_fpt_t* vector);
// Build an array of the list's vector pointers, in list order.
vector_int_t** list_points_to_array_int(const list_points_int_t* list);
vector_fpt_t** list_points_to_array_fpt(const list_points_fpt_t* list);
#endif //PROG_KMEANS_LINKEDLIST_H
#include <assert.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "distance.h"
#include "io.h"
#include "kmeans.h"
#include "linkedlist.h"
#include "vector.h"
int main(int argc, char **argv) {
// Distance selected with the -d option; the values are used as indices into
// the DIST_FUNC_INT / DIST_FUNC_FPT tables.
enum DistanceFunctionType {
EUCLID = 0, MANHATTAN = 1, CHEBYSHEV = 2
};
// Element type selected with the -t option ("fpt" or "int").
enum DataType {
FLOAT = 0, INT = 1
};
// Integer distance functions, indexed by enum DistanceFunctionType.
fpt_t (* const DIST_FUNC_INT[])(const vector_int_t*, const vector_int_t*) = {
distance_euclid_int,
distance_manhattan_int,
distance_chebyshev_int};
// Floating-point distance functions, indexed by enum DistanceFunctionType.
fpt_t (* const DIST_FUNC_FPT[])(const vector_fpt_t*, const vector_fpt_t*) = {
distance_euclid_fpt,
distance_manhattan_fpt,
distance_chebyshev_fpt};
// Print the command-line usage on stderr, using `callname` as the program
// name.
void help(const char* callname) {
    static const char usage_format[] =
            "\nUSAGE: %s -i INPUT_FILE -o OUTPUT_FILE -d [euclid,manhattan,chebyshev] -t [fpt,int]\n";
    fprintf(stderr, usage_format, callname);
}
// Parse the command line with getopt.
//   -i PATH   stored in *ipath
//   -o PATH   stored in *opath
//   -d NAME   euclid | manhattan | chebyshev, stored in *df
//   -t NAME   fpt | int, stored in *type
//   -h        print the usage and exit
// Outputs whose option is absent, or whose -d / -t value is not recognised,
// are left untouched, so callers must pre-load them with defaults.
void parse_args(int argc, char** argv, char** ipath, char** opath, enum DistanceFunctionType* df, enum DataType* type) {
    int opt;
    while ((opt = getopt(argc, argv, "i:o:d:t:h")) != -1) {
        switch (opt) {
            case 'h':
                help(argv[0]);
                exit(EXIT_FAILURE);
            case 'i':
                *ipath = optarg;
                break;
            case 'o':
                *opath = optarg;
                break;
            case 'd':
                if (strcmp(optarg, "euclid") == 0) *df = EUCLID;
                else if (strcmp(optarg, "manhattan") == 0) *df = MANHATTAN;
                else if (strcmp(optarg, "chebyshev") == 0) *df = CHEBYSHEV;
                break;
            case 't':
                if (strcmp(optarg, "fpt") == 0) *type = FLOAT;
                else if (strcmp(optarg, "int") == 0) *type = INT;
                break;
            case '?':
                // `opt` is always '?' here; the offending character is in optopt
                fprintf(stderr, "UNKNOWN OPTION : %c\n", optopt);
                break;
            default:
                // https://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html
                abort();
        }
    }
}
// Integer pipeline: read the points from `ifile`, initialise `nb_clusters`
// clusters, run k-means with the selected distance and write `dim`,
// `nb_clusters` and the clusters to `ofile`.
// Returns EXIT_SUCCESS, or EXIT_FAILURE when the points cannot be loaded or
// the clusters cannot be initialised. Every allocation made here is released
// before returning.
int main_int(FILE* ifile, FILE* ofile, const size_t dim, const size_t nb_clusters, const enum DistanceFunctionType dist_func_type) {
    // INIT
    int status = EXIT_FAILURE;
    vector_int_t** points = NULL;
    cluster_int_t** clusters = NULL;
    // READ
    list_points_int_t* list = io_get_vector_list_int(ifile, dim);
    if (NULL == list) {
        fprintf(stderr, "COULD NOT READ POINTS !\n");
        return EXIT_FAILURE;
    }
    const size_t point_count = list->size;
    points = list_points_to_array_int(list);
    // if the array could not be built, the list still owns the vectors:
    // destroy them with it instead of leaking them
    list_points_destroy_int(list, NULL == points);
    list = NULL;
    if (NULL == points) {
        fprintf(stderr, "NO POINTS TO PROCESS !\n");
        return EXIT_FAILURE;
    }
    // ALGORITHM
    printf("INIT: ");
    clusters = kmeans_init_clusters_int((const vector_int_t**) points, point_count, nb_clusters);
    if (NULL == clusters) {
        printf("FAILED\n");
        fprintf(stderr, "CLUSTER INITIALISATION FAILED (AT LEAST 2 POINTS AND 2 CLUSTERS ARE REQUIRED) !\n");
        goto cleanup;
    }
    printf("DONE\n");
    printf("KMEANS: begin\n");
    kmeans_int(points, point_count, clusters, nb_clusters, DIST_FUNC_INT[dist_func_type]);
    printf("KMEANS: DONE !\n");
    // WRITE
    fprintf(ofile, "%zu\n%zu\n", dim, nb_clusters);
    io_write_clusters_to_file_int(ofile, clusters, nb_clusters);
    status = EXIT_SUCCESS;
cleanup:
    // CLEANUP
    if (clusters != NULL) {
        for (size_t i = 0; i < nb_clusters; ++i) cluster_destroy_int(clusters[i]);
        free(clusters);
    }
    for (size_t i = 0; i < point_count; ++i) vector_destroy_int(points[i]);
    free(points);
    // EXIT
    return status;
}
// Floating-point pipeline: read the points from `ifile`, initialise
// `nb_clusters` clusters, run k-means with the selected distance and write
// `dim`, `nb_clusters` and the clusters to `ofile`.
// Returns EXIT_SUCCESS, or EXIT_FAILURE when the points cannot be loaded or
// the clusters cannot be initialised. Every allocation made here is released
// before returning.
int main_fpt(FILE* ifile, FILE* ofile, const size_t dim, const size_t nb_clusters, const enum DistanceFunctionType dist_func_type) {
    // INIT
    int status = EXIT_FAILURE;
    vector_fpt_t** points = NULL;
    cluster_fpt_t** clusters = NULL;
    // READ
    list_points_fpt_t* list = io_get_vector_list_fpt(ifile, dim);
    if (NULL == list) {
        fprintf(stderr, "COULD NOT READ POINTS !\n");
        return EXIT_FAILURE;
    }
    const size_t point_count = list->size;
    points = list_points_to_array_fpt(list);
    // if the array could not be built, the list still owns the vectors:
    // destroy them with it instead of leaking them
    list_points_destroy_fpt(list, NULL == points);
    list = NULL;
    if (NULL == points) {
        fprintf(stderr, "NO POINTS TO PROCESS !\n");
        return EXIT_FAILURE;
    }
    // ALGORITHM
    printf("INIT: ");
    clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters);
    if (NULL == clusters) {
        printf("FAILED\n");
        fprintf(stderr, "CLUSTER INITIALISATION FAILED (AT LEAST 2 POINTS AND 2 CLUSTERS ARE REQUIRED) !\n");
        goto cleanup;
    }
    printf("DONE\n");
    printf("KMEANS: begin\n");
    kmeans_fpt(points, point_count, clusters, nb_clusters, DIST_FUNC_FPT[dist_func_type]);
    printf("KMEANS: DONE !\n");
    // WRITE
    fprintf(ofile, "%zu\n%zu\n", dim, nb_clusters);
    io_write_clusters_to_file_fpt(ofile, clusters, nb_clusters);
    status = EXIT_SUCCESS;
cleanup:
    // CLEANUP
    if (clusters != NULL) {
        for (size_t i = 0; i < nb_clusters; ++i) cluster_destroy_fpt(clusters[i]);
        free(clusters);
    }
    for (size_t i = 0; i < point_count; ++i) vector_destroy_fpt(points[i]);
    free(points);
    // EXIT
    return status;
}
/**
 * Program entry point: parses the command line, opens the input and output
 * streams (standard input / output when no path is given), reads the dimension
 * and the number of clusters, then runs the integer or floating point routine.
 *
 * @param argc  number of command line arguments
 * @param argv  command line arguments, forwarded to parse_args
 * @return the value returned by the selected routine, or EXIT_FAILURE when a
 *         file cannot be opened or when the dimension or cluster count is zero
 */
int main(int argc, char** argv) {
    // init defaults
    char* ipath = NULL;
    char* opath = NULL;
    enum DistanceFunctionType disttype = EUCLID;
    enum DataType datatype = FLOAT;
    // parse args
    parse_args(argc, argv, &ipath, &opath, &disttype, &datatype);
    // open files; each stream is checked before anything reads from or closes it
    FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
    if (NULL == ifile) {
        perror(ipath);
        return EXIT_FAILURE;
    }
    FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
    if (NULL == ofile) {
        perror(opath);
        fclose(ifile);
        return EXIT_FAILURE;
    }
    // read dimension and desired number of clusters from file
    int return_value = EXIT_FAILURE;
    const size_t dim = io_read_int(ifile);
    const size_t nb_clusters = io_read_int(ifile);
    if (0 == dim) {
        printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
    } else if (0 == nb_clusters) {
        printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
    } else {
        // type specific code
        int (* main_routine)(FILE*, FILE*, const size_t, const size_t, const enum DistanceFunctionType);
        main_routine = INT == datatype ? main_int : main_fpt;
        // fputs: the text is data, not a format string
        fputs(INT == datatype ? "TYPE: INT\n" : "TYPE: FLOAT\n", stdout);
        return_value = main_routine(ifile, ofile, dim, nb_clusters, disttype);
    }
    // cleanup (shared by the success and the validation-failure paths)
    fclose(ifile);
    fclose(ofile);
    return return_value;
}
#include "vector.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/**
 * Allocates an integer vector of dim components, all set to zero.
 *
 * @param dim  number of components
 * @return the new vector (to be released with vector_destroy_int), or NULL
 *         when either the structure or the component array cannot be allocated
 */
vector_int_t* vector_create_int(const size_t dim) {
    vector_int_t* v = malloc(sizeof *v);
    if (NULL == v) return NULL;
    v->dim = dim;
    v->data = calloc(dim, sizeof *v->data);
    // calloc(0, ...) may legitimately return NULL, so only a non-empty request counts as a failure
    if (NULL == v->data && dim > 0) {
        free(v);
        return NULL;
    }
    return v;
}
/**
 * Allocates a floating point vector of dim components, all zero-initialised
 * by calloc.
 *
 * @param dim  number of components
 * @return the new vector (to be released with vector_destroy_fpt), or NULL
 *         when either the structure or the component array cannot be allocated
 */
vector_fpt_t* vector_create_fpt(const size_t dim) {
    vector_fpt_t* v = malloc(sizeof *v);
    if (NULL == v) return NULL;
    v->dim = dim;
    v->data = calloc(dim, sizeof *v->data);
    // calloc(0, ...) may legitimately return NULL, so only a non-empty request counts as a failure
    if (NULL == v->data && dim > 0) {
        free(v);
        return NULL;
    }
    return v;
}
/**
 * Releases an integer vector together with its component array.
 * A NULL argument is ignored.
 *
 * @param vp  vector to release, may be NULL
 */
void vector_destroy_int(vector_int_t* vp) {
    if (vp != NULL) {
        free(vp->data);
        free(vp);
    }
}
/**
 * Releases a floating point vector together with its component array.
 * A NULL argument is ignored.
 *
 * @param vp  vector to release, may be NULL
 */
void vector_destroy_fpt(vector_fpt_t* vp) {
    if (vp != NULL) {
        free(vp->data);
        free(vp);
    }
}
/**
 * Creates an independent copy of an integer vector.
 *
 * @param v  vector to copy, may be NULL
 * @return a new vector with the same dimension and component values, or NULL
 *         when v is NULL or the copy cannot be allocated
 */
vector_int_t* vector_copy_int(const vector_int_t* v) {
    if (NULL == v) return NULL;
    vector_int_t* c = vector_create_int(v->dim);
    if (NULL == c) return NULL;
    // never write through a component array that could not be allocated
    if (NULL == c->data && v->dim > 0) {
        vector_destroy_int(c);
        return NULL;
    }
    for (size_t i = 0; i < v->dim; ++i) {
        c->data[i] = v->data[i];
    }
    return c;
}
/**
 * Creates an independent copy of a floating point vector.
 *
 * @param v  vector to copy, may be NULL
 * @return a new vector with the same dimension and component values, or NULL
 *         when v is NULL or the copy cannot be allocated
 */
vector_fpt_t* vector_copy_fpt(const vector_fpt_t* v) {
    if (NULL == v) return NULL;
    vector_fpt_t* c = vector_create_fpt(v->dim);
    if (NULL == c) return NULL;
    // never write through a component array that could not be allocated
    if (NULL == c->data && v->dim > 0) {
        vector_destroy_fpt(c);
        return NULL;
    }
    for (size_t i = 0; i < v->dim; ++i) {
        c->data[i] = v->data[i];
    }
    return c;
}
/**
 * Tells whether two integer vectors have the same dimension and the same
 * value in every component. Neither argument may be NULL.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @return true when the vectors are component-wise equal, false otherwise
 */
bool vector_equals_int(const vector_int_t* v1, const vector_int_t* v2) {
    const size_t n = v1->dim;
    if (n != v2->dim) return false;
    // advance while the components match; reaching n means no mismatch was found
    size_t k = 0;
    while (k < n && v1->data[k] == v2->data[k]) ++k;
    return k == n;
}
/**
 * Tells whether two floating point vectors have the same dimension and
 * exactly equal values in every component (no tolerance is applied).
 * Neither argument may be NULL.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @return true when the vectors are component-wise equal, false otherwise
 */
bool vector_equals_fpt(const vector_fpt_t* v1, const vector_fpt_t* v2) {
    const size_t n = v1->dim;
    if (n != v2->dim) return false;
    // advance while the components match; reaching n means no mismatch was found
    size_t k = 0;
    while (k < n && v1->data[k] == v2->data[k]) ++k;
    return k == n;
}
/**
 * Writes an integer vector to file on a single line, components separated by
 * " , " and followed by a newline. An empty vector produces just the newline.
 *
 * @param file  destination stream
 * @param v     vector to write, must not be NULL
 */
void vector_print_to_file_int(FILE* file, const vector_int_t* v) {
    for (size_t i = 0; i < v->dim; ++i) {
        if (i > 0) fputs(" , ", file);
        // the former "%lud" printed the value as unsigned followed by a stray 'd';
        // the cast keeps the format valid whatever integer type int_t is
        // NOTE(review): assumes int_t is a signed integer type - confirm in common.h
        fprintf(file, "%lld", (long long) v->data[i]);
    }
    fputc('\n', file);
}
/**
 * Writes a floating point vector to file on a single line, components
 * separated by " , " and followed by a newline. An empty vector produces just
 * the newline (the first component is no longer read unconditionally).
 *
 * @param file  destination stream
 * @param v     vector to write, must not be NULL
 */
void vector_print_to_file_fpt(FILE* file, const vector_fpt_t* v) {
    for (size_t i = 0; i < v->dim; ++i) {
        if (i > 0) fputs(" , ", file);
        fprintf(file, "%lf", v->data[i]);
    }
    fputc('\n', file);
}
/**
 * Adds the components of a to those of v, in place. Only the components
 * present in both vectors (the smaller of the two dimensions) are touched.
 * A NULL v is ignored.
 *
 * @param v  vector that is modified, may be NULL
 * @param a  vector that is added; passed by value, so its component array is shared, not copied
 */
void vector_add_inplace_int(vector_int_t* v, const vector_int_t a) {
    if (v != NULL) {
        size_t common = a.dim;
        if (v->dim < common) common = v->dim;
        for (size_t k = 0; k < common; ++k) {
            v->data[k] += a.data[k];
        }
    }
}
/**
 * Adds the components of a to those of v, in place. Only the components
 * present in both vectors (the smaller of the two dimensions) are touched.
 * A NULL v is ignored.
 *
 * @param v  vector that is modified, may be NULL
 * @param a  vector that is added; passed by value, so its component array is shared, not copied
 */
void vector_add_inplace_fpt(vector_fpt_t* v, const vector_fpt_t a) {
    if (v != NULL) {
        size_t common = a.dim;
        if (v->dim < common) common = v->dim;
        for (size_t k = 0; k < common; ++k) {
            v->data[k] += a.data[k];
        }
    }
}
/**
 * Divides every component of v by a, in place, using integer division.
 * A NULL v or a zero divisor leaves the vector untouched: integer division by
 * zero is undefined behaviour and the function has no way to report an error.
 *
 * @param v  vector that is modified, may be NULL
 * @param a  divisor
 */
void vector_div_inplace_int(vector_int_t* v, const int_t a) {
    if (NULL == v || 0 == a) return;
    for (size_t i = 0; i < v->dim; ++i) v->data[i] /= a;
}
/**
 * Divides every component of v by a, in place. A NULL v is ignored.
 *
 * @param v  vector that is modified, may be NULL
 * @param a  divisor
 */
void vector_div_inplace_fpt(vector_fpt_t* v, const fpt_t a) {
    if (v != NULL) {
        const size_t n = v->dim;
        size_t k = 0;
        while (k < n) {
            v->data[k] /= a;
            ++k;
        }
    }
}
/**
 * Prints an integer vector to standard output on a single line, components
 * separated by " , " and followed by a newline. A NULL vector prints "NULL"
 * without a newline; an empty vector prints just the newline.
 *
 * @param v  vector to print, may be NULL
 */
void vector_print_int(const vector_int_t* v) {
    if (NULL == v) {
        printf("NULL");
        return;
    }
    for (size_t p = 0; p < v->dim; ++p) {
        if (p > 0) printf(" , ");
        // the cast keeps the format valid whatever integer type int_t is
        // NOTE(review): assumes int_t is a signed integer type - confirm in common.h
        printf("%lld", (long long) v->data[p]);
    }
    printf("\n");
}
/**
 * Prints a floating point vector to standard output on a single line,
 * components separated by " , " and followed by a newline. A NULL vector
 * prints "NULL" without a newline; an empty vector prints just the newline
 * (the first component is no longer read unconditionally).
 *
 * @param v  vector to print, may be NULL
 */
void vector_print_fpt(const vector_fpt_t* v) {
    if (NULL == v) {
        printf("NULL");
        return;
    }
    for (size_t p = 0; p < v->dim; ++p) {
        if (p > 0) printf(" , ");
        printf("%lf", v->data[p]);
    }
    printf("\n");
}
#ifndef PROG_KMEANS_VECTOR_H
#define PROG_KMEANS_VECTOR_H
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
/** Vector of integer (int_t) components with an explicit length. */
typedef struct vector_int {
// number of components in data
size_t dim;
// component array, indexed from 0 to dim - 1 by the vector functions
int_t* data;
} vector_int_t;
/** Vector of floating point (fpt_t) components with an explicit length. */
typedef struct vector_fpt {
// number of components in data
size_t dim;
// component array, indexed from 0 to dim - 1 by the vector functions
fpt_t* data;
} vector_fpt_t;
/**
 * Allocate a vector of dim components; the component array comes from calloc.
 * Returns NULL when the vector structure cannot be allocated.
 * The result is released with the matching vector_destroy_* function.
 */
vector_int_t* vector_create_int(const size_t dim);
vector_fpt_t* vector_create_fpt(const size_t dim);
/** Free a vector and its component array; a NULL argument is ignored. */
void vector_destroy_int(vector_int_t* vp);
void vector_destroy_fpt(vector_fpt_t* vp);
/**
 * Return a newly created vector with the same dimension and component values
 * as v, or NULL when v is NULL or the new vector cannot be created.
 */
vector_int_t* vector_copy_int(const vector_int_t* v);
vector_fpt_t* vector_copy_fpt(const vector_fpt_t* v);
/**
 * Return true when both vectors have the same dimension and every pair of
 * components compares equal. The arguments are dereferenced without a NULL check.
 */
bool vector_equals_int(const vector_int_t* v1, const vector_int_t* v2);
bool vector_equals_fpt(const vector_fpt_t* v1, const vector_fpt_t* v2);
/**
 * Write the components of v to file on one line, separated by " , " and
 * terminated by a newline. v is dereferenced without a NULL check.
 */
void vector_print_to_file_int(FILE* file, const vector_int_t* v);
void vector_print_to_file_fpt(FILE* file, const vector_fpt_t* v);
/**
 * Add the components of a to those of v, in place, over the components both
 * vectors have (the smaller of the two dimensions). A NULL v is ignored.
 */
void vector_add_inplace_int(vector_int_t* v, const vector_int_t a);
void vector_add_inplace_fpt(vector_fpt_t* v, const vector_fpt_t a);
/**
 * Divide every component of v by a, in place. A NULL v is ignored.
 * Callers of the integer variant are expected to pass a non-zero divisor.
 */
void vector_div_inplace_int(vector_int_t* v, const int_t a);
void vector_div_inplace_fpt(vector_fpt_t* v, const fpt_t a);
/**
 * Print the components of v to standard output on one line, separated by
 * " , " and terminated by a newline; a NULL v prints "NULL" instead.
 */
void vector_print_int(const vector_int_t* v);
void vector_print_fpt(const vector_fpt_t* v);
#endif //PROG_KMEANS_VECTOR_H