Skip to content
Snippets Groups Projects
Commit d2167c32 authored by Boris Stefanovic's avatar Boris Stefanovic
Browse files

ADD: cluster centers initialisation

parent 21de63b2
Branches
No related tags found
No related merge requests found
......@@ -6,18 +6,82 @@ theme: "Frankfurt"
geometry: "margin=40mm"
mainfont: DejaVu Sans
header-includes:
- \usepackage{float}
- \let\origfigure\figure
- \let\endorigfigure\endfigure
- \renewenvironment{figure}[1][2] {\expandafter\origfigure\expandafter[H]} {\endorigfigure}
---
\newpage
# Structures de Données
## Point
- chaque point est un vecteur
- types entiers et virgule flottante séparés
- "common.h" contient les définitions de `int_t` et `fpt_t`
```c
typedef struct vector_int_t_ {
size_t dim;
int_t* data;
} vector_int_t;
typedef struct vector_fpt_t_ {
size_t dim;
fpt_t* data;
} vector_fpt_t;
```
## Faciliter l'Association à un Cluster
- structure vecteur vue précédemment générale
- on veut associer chaque point à un cluster
- le cluster auquel chaque point appartient change au cours de l'algorithme
- stocker les points "dans" des structures "cluster" est peu judicieux
- on stocke un identifiant de cluster (un pointeur) dans une structure "point de cluster"
## Cluster
- un cluster peut être représenté par
- un identifiant: un pointeur, forcément unique
- son centre: un point virtuel, la valeur derrière le pointeur
# Décisions
```c
typedef vector_int_t* cluster_int_t;
```
## Point de Cluster
```c
typedef struct cluster_point_int {
    vector_int_t* vector;   // coordinates of the point; released by cluster_point_int_destroy
    cluster_int_t cluster;  // cluster identifier (cluster_int_t is already a pointer); NULL after creation
} cluster_point_int_t;
```
## Ensemble de Points
- parcours répétés de l'ensemble de tous les points
- pas d'ordre particulier (sauf à l'initialisation des centroïdes)
- une liste simplement chaînée fera l'affaire
```c
typedef struct ll_point_int_node {
const cluster_point_int_t* point;
struct ll_point_int_node* next;
} ll_point_int_node_t;
typedef struct ll_point_int {
ll_point_int_node_t* head;
ll_point_int_node_t* tail;
size_t size;
} ll_point_int_t;
```
# Algorithmique
# Implémentation
# Démonstration
......@@ -3,6 +3,7 @@
//
#include "cluster.h"
#include <stdlib.h>
#include "vector.h"
......@@ -12,3 +13,10 @@ cluster_point_int_t* cluster_point_int_create(vector_int_t* vector) {
elem->vector = vector;
elem->cluster = NULL;
}
/*
 * Releases a cluster point and the vector it refers to.
 * A NULL argument is accepted and ignored.
 */
void cluster_point_int_destroy(cluster_point_int_t* cp) {
    if (cp != NULL) {
        vector_int_destroy(cp->vector);
        free(cp);
    }
}
......@@ -8,14 +8,16 @@
#include "vector.h"
typedef vector_int_t cluster_int_t; // a cluster may be represented by its center
typedef vector_int_t* cluster_int_t; // a cluster may be represented by its center
typedef struct cluster_point_int {
vector_int_t* vector;
cluster_int_t* cluster; // justified by "many-to-one" relationship and several passes over all points
cluster_int_t cluster; // justified by "many-to-one" relationship and several passes over all points
} cluster_point_int_t;
cluster_point_int_t* cluster_point_int_create(vector_int_t* vector);
void cluster_point_int_destroy(cluster_point_int_t* cp);
#endif //PROG_KMEANS_CLUSTER_H
#include "common.h"
#include <stdbool.h>
#include <stdlib.h>
#include <time.h>
/* True once the C library generator has been seeded by init_rand(). */
bool randinit = false;

/*
 * Seeds rand() from the wall clock and records that seeding took place.
 * Declared static: a plain `inline` definition provides no external
 * definition in C99/C11, so a non-inlined call would fail to link.
 */
static inline void init_rand(void) {
    srand((unsigned int) time(NULL));
    randinit = true;
}

/*
 * Returns a pseudo-random integer in [0, max).
 * An empty range (max <= 0) yields 0 instead of evaluating rand() % 0.
 */
int rand_int(const int max) {
    if (!randinit) init_rand();
    if (max <= 0) return 0;
    return rand() % max;
}

/*
 * Returns a pseudo-random integer in [min, max); the bounds are swapped
 * when given in the wrong order. When both bounds are equal, that value
 * is returned.
 */
int rand_int_range(int min, int max) {
    if (min > max) {
        int swap = min;
        min = max;
        max = swap;
    }
    if (min == max) return min;
    return min + rand_int(max - min);
}

/* Returns a pseudo-random double in [0, 1] (both ends reachable). */
double rand_double_range_one(void) {
    if (!randinit) init_rand();
    return ((double) rand()) / ((double) RAND_MAX);
}

/*
 * Returns a pseudo-random double between min and max (inclusive);
 * the bounds are swapped when given in the wrong order.
 */
double rand_double_range(double min, double max) {
    if (min > max) {
        double swap = min;
        min = max;
        max = swap;
    }
    return min + rand_double_range_one() * (max - min);
}
......@@ -12,4 +12,13 @@ typedef int64_t int_t;
typedef double fpt_t;
int rand_int(const int max);
int rand_int_range(int min, int max);
double rand_double_range_one();
double rand_double_range(double min, double max);
#endif //PROG_KMEANS_COMMON_H
......@@ -27,7 +27,7 @@ fpt_t distance_euclid_int(const vector_int_t* p1, const vector_int_t* p2) {
int_t item = diff * diff;
acc += item;
}
return sqrt((double) acc);
return sqrt((fpt_t) acc);
}
fpt_t distance_manhattan_int(const vector_int_t* p1, const vector_int_t* p2) {
......@@ -38,7 +38,7 @@ fpt_t distance_manhattan_int(const vector_int_t* p1, const vector_int_t* p2) {
int_t item = diff >= 0 ? diff : -diff;
acc += item;
}
return (double) acc;
return (fpt_t) acc;
}
fpt_t distance_chebyshev_int(const vector_int_t* p1, const vector_int_t* p2) {
......@@ -49,38 +49,38 @@ fpt_t distance_chebyshev_int(const vector_int_t* p1, const vector_int_t* p2) {
item = abs_diff_int(p1->data[i], p2->data[i]);
if (item > max) max = item;
}
return (double) max;
return (fpt_t) max;
}
fpt_t distance_euclid_fpt(const vector_int_t* p1, const vector_int_t* p2) {
fpt_t distance_euclid_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) {
if (p1->dim != p2->dim)return ERROR;
int_t acc = 0;
fpt_t acc = 0;
for (size_t i = 0; i < p1->dim; ++i) {
int_t diff = p2->data[i] - p1->data[i];
int_t item = diff * diff;
fpt_t diff = p2->data[i] - p1->data[i];
fpt_t item = diff * diff;
acc += item;
}
return sqrt((double) acc);
return sqrt((fpt_t) acc);
}
fpt_t distance_manhattan_fpt(const vector_int_t* p1, const vector_int_t* p2) {
fpt_t distance_manhattan_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) {
if (p1->dim != p2->dim)return ERROR;
int_t acc = 0;
fpt_t acc = 0;
for (size_t i = 0; i < p1->dim; ++i) {
int_t diff = p2->data[i] - p1->data[i];
int_t item = diff >= 0 ? diff : -diff;
fpt_t diff = p2->data[i] - p1->data[i];
fpt_t item = diff >= 0 ? diff : -diff;
acc += item;
}
return (double) acc;
return (fpt_t) acc;
}
fpt_t distance_chebyshev_fpt(const vector_int_t* p1, const vector_int_t* p2) {
fpt_t distance_chebyshev_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) {
if (p1->dim != p2->dim)return ERROR;
int_t max = ERROR;
int_t item;
fpt_t max = ERROR;
fpt_t item;
for (size_t i = 0; i < p1->dim; ++i) {
item = abs_diff_int(p1->data[i], p2->data[i]);
item = abs_diff_fpt(p1->data[i], p2->data[i]);
if (item > max) max = item;
}
return (double) max;
return (fpt_t) max;
}
......@@ -4,3 +4,39 @@
#include "kmeans.h"
#include "cluster.h"
/*
 * Creates `nclusters` initial cluster centers, each coordinate drawn at
 * random between the smallest and largest value observed for that
 * coordinate among `points`.
 *
 * Returns a newly allocated array of `nclusters` centers; the caller owns
 * the array and every center in it. Returns NULL when nclusters < 2, when
 * there are no points, or when an allocation fails (nothing is leaked in
 * that case).
 *
 * NOTE(review): all points are assumed to share the dimension of
 * points[0]; rand_int_range() takes `int` bounds, so coordinates outside
 * the int range are narrowed, and its upper bound is exclusive.
 */
cluster_int_t* kmeans_init_clusters_int(const cluster_point_int_t** points, const size_t point_count, const size_t nclusters) {
    if (nclusters < 2) return NULL;
    if (NULL == points || 0 == point_count) return NULL;
    cluster_int_t* clusters = calloc(nclusters, sizeof(cluster_int_t));
    if (NULL == clusters) return NULL;
    size_t created = 0;
    // determine range in which we are working
    vector_int_t* min = vector_int_copy(points[0]->vector);
    vector_int_t* max = vector_int_copy(points[0]->vector);
    if (NULL == min || NULL == max) goto fail;
    // points[0] already seeds min and max, so scanning starts at index 1
    for (size_t i = 1; i < point_count; ++i) {
        for (size_t p = 0; p < max->dim; ++p) {
            const int_t value = points[i]->vector->data[p];
            if (value < min->data[p]) min->data[p] = value;
            if (value > max->data[p]) max->data[p] = value;
        }
    }
    // until we have enough centers
    for (; created < nclusters; ++created) {
        cluster_int_t center = vector_int_create(max->dim);
        if (NULL == center) goto fail;
        for (size_t p = 0; p < center->dim; ++p) {
            center->data[p] = rand_int_range(min->data[p], max->data[p]);
        }
        // TODO: maybe check center is not already in clusters, although probability is extremely low...
        clusters[created] = center;
    }
    // the bounding vectors were only needed to draw the centers
    vector_int_destroy(min);
    vector_int_destroy(max);
    return clusters;

fail:
    for (size_t i = 0; i < created; ++i) vector_int_destroy(clusters[i]);
    free(clusters);
    vector_int_destroy(min);
    vector_int_destroy(max);
    return NULL;
}
/*
 * Entry point of the k-means algorithm on integer points; not implemented
 * yet (the body is empty).
 *
 * points / point_count:  array of cluster points and its length
 * clusters / nb_clusters: array of cluster centers and its length
 * distance_function:     metric used to compare two vectors
 *
 * NOTE(review): the metric is typed on vector_fpt_t while the points hold
 * vector_int_t - presumably the *_int distance functions are meant here;
 * confirm before implementing.
 */
void kmeans_int(
cluster_point_int_t** points, const size_t point_count,
cluster_int_t* clusters, const size_t nb_clusters,
fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) {
// TODO: implement the assignment / update iterations
}
......@@ -5,8 +5,16 @@
#ifndef PROG_KMEANS_KMEANS_H
#define PROG_KMEANS_KMEANS_H
#include "cluster.h"
#include "linkedlist.h"
//
cluster_int_t* kmeans_init_clusters_int(const cluster_point_int_t** points, const size_t point_count, const size_t nclusters);
void kmeans_int(
cluster_point_int_t** points, const size_t point_count,
cluster_int_t* clusters, const size_t nb_clusters,
fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*));
#endif //PROG_KMEANS_KMEANS_H
......@@ -3,34 +3,76 @@
//
#include "linkedlist.h"
#include <assert.h>
#include <stdbool.h>
#include "cluster.h"
/*
 * Allocates a list node holding a freshly created cluster point that
 * wraps `vec`. Returns NULL when either allocation fails; in that case
 * nothing is leaked and `vec` is left untouched.
 */
ll_point_int_node_t* ll_point_int_create_node(vector_int_t* vec) {
    ll_point_int_node_t* node = malloc(sizeof *node);
    if (NULL == node) return NULL;
    cluster_point_int_t* point = cluster_point_int_create(vec);
    if (NULL == point) {
        free(node);  // do not leak the node when the point cannot be created
        return NULL;
    }
    node->point = point;
    node->next = NULL;
    return node;
}
ll_vint_t* ll_vint_create() {
ll_vint_t* ll = NULL;
ll = malloc(sizeof(ll_vint_t));
/*
 * Frees a single list node. When `full` is true the cluster point stored
 * in the node is destroyed as well; otherwise the point is left alive for
 * whoever still references it. A NULL node is ignored.
 */
void ll_point_int_destroy_node(ll_point_int_node_t* node, const bool full) {
    if (node != NULL) {
        if (full) {
            cluster_point_int_destroy(node->point);
        }
        free(node);
    }
}
/*
 * Allocates an empty list (no head, no tail, size 0).
 * Returns NULL when the allocation fails.
 */
ll_point_int_t* ll_point_int_create() {
    ll_point_int_t* list = malloc(sizeof(ll_point_int_t));
    if (list != NULL) {
        list->size = 0;
        list->head = NULL;
        list->tail = NULL;
    }
    return list;
}
void ll_vint_append(ll_vint_t* list, const vector_int_t* vector) {
/*
 * Frees every node of the list and then the list itself. `full` is
 * forwarded to ll_point_int_destroy_node() and decides whether the stored
 * cluster points are destroyed too. A NULL list is ignored.
 */
void ll_point_int_destroy(ll_point_int_t* list, const bool full) {
    if (list == NULL) {
        return;
    }
    ll_point_int_node_t* current = list->head;
    while (current != NULL) {
        // detach the node before releasing it
        ll_point_int_node_t* following = current->next;
        list->head = following;
        ll_point_int_destroy_node(current, full);
        current = following;
    }
    free(list);
}
/*
 * Appends a new node wrapping `vector` at the tail of `list`.
 * Nothing happens when `list` or `vector` is NULL, or when the node cannot
 * be allocated; the list size is only incremented on success, so callers
 * can detect a failed append by comparing sizes.
 */
void ll_point_int_append(ll_point_int_t* list, vector_int_t* vector) {
    if (NULL == list || NULL == vector) return;
    ll_point_int_node_t* node = ll_point_int_create_node(vector);
    if (NULL == node) return;  // allocation failed: leave the list unchanged
    if (NULL == list->head) { // if list is empty
        list->head = node;
    } else {
        list->tail->next = node;
    }
    // the node comes back from ll_point_int_create_node with next == NULL
    list->tail = node;
    list->size++;
}
/*
 * Copies the point pointers of `list`, in list order, into a newly
 * allocated array owned by the caller (the points themselves are shared,
 * not duplicated).
 *
 * When `size_ptr` is not NULL it receives the number of elements, or 0
 * when the function fails. Returns NULL when `list` is NULL or the
 * allocation fails.
 */
cluster_point_int_t** ll_point_int_to_array(const ll_point_int_t* list, size_t* size_ptr) {
    if (size_ptr != NULL) *size_ptr = 0;  // defined value on every failure path
    if (NULL == list) return NULL;
    cluster_point_int_t** a = calloc(list->size, sizeof *a);
    if (NULL == a) return NULL;
    size_t idx = 0;
    // the idx bound keeps writes inside the array even if size and the
    // actual node count ever disagree in a build without assertions
    for (const ll_point_int_node_t* cur = list->head; cur != NULL && idx < list->size; cur = cur->next) {
        a[idx] = cur->point;
        ++idx;
    }
    assert(idx == list->size);
    if (size_ptr != NULL) *size_ptr = list->size;
    return a;
}
......@@ -5,25 +5,33 @@
#ifndef PROG_KMEANS_LINKEDLIST_H
#define PROG_KMEANS_LINKEDLIST_H
#include <stdbool.h>
#include "cluster.h"
#include "vector.h"
typedef struct ll_vector_int_node {
const vector_int_t* data;
struct ll_vector_int_node* next;
} ll_vint_node_t;
typedef struct ll_point_int_node {
cluster_point_int_t* point;
struct ll_point_int_node* next;
} ll_point_int_node_t;
typedef struct ll_vector_int {
ll_vint_node_t* head;
ll_vint_node_t* tail;
typedef struct ll_point_int {
ll_point_int_node_t* head;
ll_point_int_node_t* tail;
size_t size;
} ll_vint_t;
} ll_point_int_t;
ll_vint_node_t* ll_vint_create_node(const vector_int_t* vec);
ll_point_int_node_t* ll_point_int_create_node(vector_int_t* vec);
ll_vint_t* ll_vint_create();
void ll_point_int_destroy_node(ll_point_int_node_t* node, const bool full);
void ll_vint_append(ll_vint_t* list, const vector_int_t* vector);
ll_point_int_t* ll_point_int_create();
void ll_point_int_destroy(ll_point_int_t* list, const bool full);
void ll_point_int_append(ll_point_int_t* list, vector_int_t* vector);
cluster_point_int_t** ll_point_int_to_array(const ll_point_int_t* list, size_t* size_ptr);
#endif //PROG_KMEANS_LINKEDLIST_H
#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "common.h"
#include "kmeans.h"
#include "linkedlist.h"
#include "vector.h"
void help(const char* callname) {
fprintf(stderr, "\nUSAGE: %s <INPUT_FILE> <OUTPUT_FILE>\n", callname);
}
int_t read_int(FILE* file) {
char* line;
size_t len;
......@@ -20,45 +22,28 @@ int_t read_int(FILE* file) {
return strtol(line, NULL, 10);
}
/*
 * Reads one line from `file` and stores its comma-separated integers in
 * `vector` (vector->dim values; missing fields become 0).
 * Returns false when `file` or `vector` is NULL or when no line could be
 * read (end of file or read error); returns true otherwise.
 */
bool read_vector_int(FILE* file, vector_int_t* vector) {
    if (NULL == file || NULL == vector) return false;
    // procure line
    char* line = NULL;
    size_t len = 0;
    // `len` is the buffer capacity, not the line length: only the return
    // value of getline tells whether a line was actually read
    if (getline(&line, &len, file) == -1) {
        free(line);  // getline may allocate even when it fails
        return false;
    }
    // tokenise
    char* toktgt = line;
    char* token = NULL;
    for (size_t i = 0; i < vector->dim; ++i, toktgt = NULL) {
        token = strtok(toktgt, ",");
        // strtol returns 0 if number not read; desired behaviour:
        vector->data[i] = token != NULL ? strtol(token, NULL, 10) : 0;
    }
    free(line);
    return true;
}
/*
 * Parses a comma-separated line into a newly allocated vector of `dim`
 * integers; missing or unparsable fields become 0. `line` is modified by
 * the tokeniser. Returns NULL when `line` is NULL or when the vector
 * cannot be allocated. The caller owns the returned vector.
 */
vector_int_t* line_to_vector_int(char* line, const size_t dim) {
    if (NULL == line) return NULL;
    vector_int_t* vector = vector_int_create(dim);
    if (NULL == vector) return NULL;
    if (NULL == vector->data && dim > 0) {
        // storage for the coordinates could not be allocated
        vector_int_destroy(vector);
        return NULL;
    }
    char* tgt = line;
    char* token = NULL;
    for (size_t i = 0; i < vector->dim; ++i, tgt = NULL) {
        token = strtok(tgt, ",");
        // strtol returns 0 if number not read, which is the desired behaviour:
        vector->data[i] = token != NULL ? strtol(token, NULL, 10) : 0;
    }
    return vector;
}
ll_vint_t* get_vector_list(FILE* ifile, const size_t dim) {
ll_vint_t* list = ll_vint_create();
ll_point_int_t* get_vector_list_int(FILE* ifile, const size_t dim) {
ll_point_int_t* list = ll_point_int_create();
char* line = NULL;
size_t len = 0;
while (getline(&line, &len, ifile) != -1) {
if (len != 0) {
vector_int_t* vector = line_to_vector_int(line, dim);
ll_vint_append(list, vector);
ll_point_int_append(list, vector);
free(line);
}
}
......@@ -81,18 +66,24 @@ int main(int argc, char** argv) {
// READ
FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
const size_t dim = read_int(ifile);
const int_t nclusters = read_int(ifile);
const size_t cluster_count = read_int(ifile); // k
if (0 <= dim) {
printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
return EXIT_FAILURE;
}
if (0 == nclusters) {
if (0 <= cluster_count) {
printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
return EXIT_FAILURE;
}
ll_vint_t* list = get_vector_list(ifile, dim);
ll_point_int_t* list = get_vector_list_int(ifile, dim);
size_t count;
const cluster_point_int_t** points = ll_point_int_to_array(list, &count);
ll_point_int_destroy(list, false);
list = NULL;
// ALGORITHM
// TODO
// init clusters
cluster_int_t* clusters = kmeans_init_clusters_int(points, count, cluster_count);
// WRITE
FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
// TODO
......
......@@ -3,37 +3,70 @@
//
#include "vector.h"
#include <stdbool.h>
#include <stdlib.h>
vector_int_t* vector_int_create(const size_t dim, const int_t* data) {
vector_int_t* vector_int_create(const size_t dim) {
vector_int_t* v;
if ((v = calloc(dim, sizeof(int_t))) == NULL) return NULL;
if ((v = malloc(sizeof(vector_int_t))) == NULL) return NULL;
v->dim = dim;
for (size_t i = 0; i < dim; ++i) v->data[i] = data[i];
v->data = calloc(dim, sizeof(int_t));
return v;
}
vector_fpt_t* vector_fpt_create(const size_t dim, const fpt_t* data) {
vector_fpt_t* vector_fpt_create(const size_t dim) {
vector_fpt_t* v;
if ((v = calloc(dim, sizeof(fpt_t))) == NULL) return NULL;
if ((v = malloc(sizeof(vector_fpt_t))) == NULL) return NULL;
v->dim = dim;
for (size_t i = 0; i < dim; ++i) v->data[i] = data[i];
v->data = calloc(dim, sizeof(fpt_t));
return v;
}
vector_int_t* vector_int_create_zero(const size_t dim) {
vector_int_t* v;
if ((v = calloc(dim, sizeof(int_t))) == NULL) return NULL;
v->dim = dim;
for (size_t i = 0; i < dim; ++i) v->data[i] = 0;
return v;
/*
 * Frees an integer vector: first its coordinate storage, then the vector
 * itself. A NULL argument is ignored.
 */
void vector_int_destroy(vector_int_t* vp) {
    if (vp != NULL) {
        free(vp->data);
        free(vp);
    }
}
vector_fpt_t* vector_fpt_create_zero(const size_t dim) {
vector_fpt_t* v;
if ((v = calloc(dim, sizeof(fpt_t))) == NULL) return NULL;
v->dim = dim;
for (size_t i = 0; i < dim; ++i) v->data[i] = 0.0;
return v;
/*
 * Frees a vector: first its coordinate storage, then the vector itself.
 * A NULL argument is ignored.
 *
 * NOTE(review): the parameter is typed vector_int_t* although the name
 * suggests vector_fpt_t*; changing it requires updating the prototype in
 * the header at the same time - confirm and fix both together.
 */
void vector_fpt_destroy(vector_int_t* vp) {
    if (vp != NULL) {
        free(vp->data);
        free(vp);
    }
}
/*
 * Returns a newly allocated vector with the same dimension and
 * coordinates as `v`, or NULL when `v` is NULL or vector_int_create()
 * fails.
 */
vector_int_t* vector_int_copy(const vector_int_t* v) {
    vector_int_t* duplicate = (v != NULL) ? vector_int_create(v->dim) : NULL;
    if (duplicate != NULL) {
        size_t i = 0;
        while (i < v->dim) {
            duplicate->data[i] = v->data[i];
            ++i;
        }
    }
    return duplicate;
}
/*
 * Tells whether two integer vectors have the same dimension and the same
 * value at every coordinate. Both arguments must be non-NULL.
 */
bool vector_int_equals(const vector_int_t* v1, const vector_int_t* v2) {
    bool same = (v1->dim == v2->dim);
    // stop at the first differing coordinate
    for (size_t i = 0; same && i < v1->dim; ++i) {
        same = (v1->data[i] == v2->data[i]);
    }
    return same;
}
/*
 * Tells whether two floating-point vectors have the same dimension and
 * compare exactly equal (operator ==) at every coordinate.
 * Both arguments must be non-NULL.
 */
bool vector_fpt_equals(const vector_fpt_t* v1, const vector_fpt_t* v2) {
    bool same = (v1->dim == v2->dim);
    // stop at the first differing coordinate
    for (size_t i = 0; same && i < v1->dim; ++i) {
        same = (v1->data[i] == v2->data[i]);
    }
    return same;
}
......@@ -10,6 +10,7 @@
* e.g. scientific measurements (floating point) and image data (integer).
*/
#include <stdbool.h>
#include <stdlib.h>
#include "common.h"
......@@ -24,13 +25,19 @@ typedef struct vector_fpt_t_ {
fpt_t* data;
} vector_fpt_t;
vector_int_t* vector_int_create(size_t dim, const int_t* data);
vector_int_t* vector_int_create(const size_t dim);
vector_fpt_t* vector_fpt_create(size_t dim, const fpt_t* data);
vector_fpt_t* vector_fpt_create(const size_t dim);
vector_int_t* vector_int_create_zero(size_t dim);
void vector_int_destroy(vector_int_t* vp);
vector_fpt_t* vector_fpt_create_zero(size_t dim);
void vector_fpt_destroy(vector_int_t* vp);
vector_int_t* vector_int_copy(const vector_int_t* v);
bool vector_int_equals(const vector_int_t* v1, const vector_int_t* v2);
bool vector_fpt_equals(const vector_fpt_t* v1, const vector_fpt_t* v2);
#endif //PROG_KMEANS_VECTOR_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment