diff --git a/pokedex_EfficientNetV2M.py b/Python/pokedex_EfficientNetV2M.py
similarity index 67%
rename from pokedex_EfficientNetV2M.py
rename to Python/pokedex_EfficientNetV2M.py
index b9fd017804d02d7508a44a62701f498c71c63386..d6207c1b9ae6edd096a3059c443e8a2f4d8ffd12 100644
--- a/pokedex_EfficientNetV2M.py
+++ b/Python/pokedex_EfficientNetV2M.py
@@ -16,32 +16,8 @@ num_classes = 151
 base_batch_size = 32
 base_lr = 1e-3
 
-# --- Auto-Tune Batch Size ---
-def find_max_batch_size(data_dir, image_size, candidate_sizes=[256, 128, 64, 32, 16]):
-    print("Tuning batch size...")
-    for bs in candidate_sizes:
-        try:
-            print(f"Trying global batch size {bs}...")
-            ds = keras.utils.image_dataset_from_directory(
-                data_dir,
-                labels="inferred",
-                label_mode="int",
-                image_size=image_size,
-                batch_size=bs,
-                shuffle=True
-            )
-            for batch in ds.take(1): # Try to load one batch
-                tf.print("✓ Batch size", bs, "works")
-            del ds
-            gc.collect()
-            tf.keras.backend.clear_session()
-            return bs
-        except tf.errors.ResourceExhaustedError:
-            print(f"✗ Batch size {bs} too large.")
-    raise RuntimeError("No suitable batch size found.")
-
-global_batch_size = find_max_batch_size(data_dir, image_size)
-scaled_lr = base_lr * (global_batch_size / base_batch_size)
+global_batch_size = 32
+scaled_lr = min(base_lr * (global_batch_size / base_batch_size), 1e-3)
 
 # --- Load Dataset ---
 full_ds = keras.utils.image_dataset_from_directory(
@@ -107,7 +83,8 @@ with strategy.scope():
 
 # --- Train ---
 callbacks = [
-    keras.callbacks.ModelCheckpoint("EfficientNetV2M/save_at_{epoch}.keras")
+    keras.callbacks.ModelCheckpoint("/home/users/d/divia/EfficientNetV2M/save_at_{epoch}.keras"),
+    keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
 ]
 
 model.fit(
diff --git a/pokedex_ResNet50.py b/Python/pokedex_ResNet50.py
similarity index 66%
rename from pokedex_ResNet50.py
rename to Python/pokedex_ResNet50.py
index 9c16244c427f887056aa072ed2725305ec3f2ffc..9659d76a850b341c54a251ea529a35c3cfc9484d 100644
--- a/pokedex_ResNet50.py
+++ b/Python/pokedex_ResNet50.py
@@ -16,32 +16,8 @@ num_classes = 151
 base_batch_size = 32
 base_lr = 1e-3
 
-# --- Auto-Tune Batch Size ---
-def find_max_batch_size(data_dir, image_size, candidate_sizes=[256, 128, 64, 32, 16]):
-    print("Tuning batch size...")
-    for bs in candidate_sizes:
-        try:
-            print(f"Trying global batch size {bs}...")
-            ds = keras.utils.image_dataset_from_directory(
-                data_dir,
-                labels="inferred",
-                label_mode="int",
-                image_size=image_size,
-                batch_size=bs,
-                shuffle=True
-            )
-            for batch in ds.take(1): # Try to load one batch
-                tf.print("✓ Batch size", bs, "works")
-            del ds
-            gc.collect()
-            tf.keras.backend.clear_session()
-            return bs
-        except tf.errors.ResourceExhaustedError:
-            print(f"✗ Batch size {bs} too large.")
-    raise RuntimeError("No suitable batch size found.")
-
-global_batch_size = find_max_batch_size(data_dir, image_size)
-scaled_lr = base_lr * (global_batch_size / base_batch_size)
+global_batch_size = 32
+scaled_lr = min(base_lr * (global_batch_size / base_batch_size), 1e-3)
 
 # --- Load Dataset ---
 full_ds = keras.utils.image_dataset_from_directory(
@@ -107,12 +83,13 @@ with strategy.scope():
 
 # --- Train ---
 callbacks = [
-    keras.callbacks.ModelCheckpoint("ResNet50/save_at_{epoch}.keras")
+    keras.callbacks.ModelCheckpoint("/home/users/d/divia/ResNet50/save_at_{epoch}.keras"),
+    keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
 ]
 
 model.fit(
     train_ds,
     validation_data=val_ds,
-    epochs=10,
+    epochs=15,
     callbacks=callbacks
 )
\ No newline at end of file
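Note on the two training-script changes above: the auto batch-size tuner is replaced by a fixed global batch size of 32, the linear learning-rate scaling rule is kept but capped at the 1e-3 baseline via min(), and an EarlyStopping callback is added next to the per-epoch checkpoints. A minimal, self-contained sketch of how those pieces fit together in isolation; the toy model, random data, and checkpoint filename are illustrative stand-ins, not the repository's EfficientNetV2M/ResNet50 setup:

    import numpy as np
    import keras

    base_batch_size = 32
    base_lr = 1e-3

    # Linear scaling rule: the learning rate grows in proportion to the global
    # batch size relative to the 32-sample baseline; min() caps it at 1e-3.
    global_batch_size = 32
    scaled_lr = min(base_lr * (global_batch_size / base_batch_size), 1e-3)

    # Toy classifier standing in for the real backbone (illustrative only).
    model = keras.Sequential([
        keras.Input(shape=(8,)),
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dense(3, activation="softmax"),
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=scaled_lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    # Checkpoint every epoch; stop once val_loss has not improved for
    # `patience` epochs and roll back to the best weights seen so far.
    callbacks = [
        keras.callbacks.ModelCheckpoint("save_at_{epoch}.keras"),
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=3,
                                      restore_best_weights=True),
    ]

    x = np.random.rand(256, 8).astype("float32")
    y = np.random.randint(0, 3, size=(256,))
    model.fit(x, y, validation_split=0.2, batch_size=global_batch_size,
              epochs=15, callbacks=callbacks)

With restore_best_weights=True, raising the epoch budget (as the ResNet50 script does, 10 to 15) is low-risk: training stops early once validation loss plateaus and the best checkpointed weights are kept.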
diff --git a/models/efficientv2m.keras b/models/efficientv2m.keras
new file mode 100644
index 0000000000000000000000000000000000000000..1c7275d712f61cf9369aeda45df72fc9bd05ce98
Binary files /dev/null and b/models/efficientv2m.keras differ
diff --git a/models/resnet50.keras b/models/resnet50.keras
new file mode 100644
index 0000000000000000000000000000000000000000..ca1c556508e3ffa175c3d3a81015954eaef6fad8
Binary files /dev/null and b/models/resnet50.keras differ
diff --git a/simple_xception.keras b/models/simple_xception.keras
similarity index 100%
rename from simple_xception.keras
rename to models/simple_xception.keras
diff --git a/slurm/EfficientNetV2M_16335505.out b/slurm/EfficientNetV2M_16335505.out
new file mode 100644
index 0000000000000000000000000000000000000000..70b7e0d14ebfaf1fbe199a83208f3dda13ea7067
--- /dev/null
+++ b/slurm/EfficientNetV2M_16335505.out
@@ -0,0 +1,97 @@
+2025-04-02 09:24:18.434290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA
+To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2025-04-02 09:24:26.222632: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.222943: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.500324: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.500659: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.501020: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.501261: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.506807: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA
+To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2025-04-02 09:24:26.678433: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.678718: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.679091: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.679427: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.679750: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:26.679999: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:28.142087: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:28.142406: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:28.144113: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:28.144367: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:28.144608: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:28.144853: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79383 MB memory: -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:41:00.0, compute capability: 8.0
+2025-04-02 09:24:28.145150: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
+2025-04-02 09:24:28.145375: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 79383 MB memory: -> device: 1, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:81:00.0, compute capability: 8.0
+WARNING:tensorflow:Using a while_loop for converting RngReadAndSkip cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting Bitcast cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting Bitcast cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting StatelessRandomUniformV2 cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting ImageProjectiveTransformV3 cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting RngReadAndSkip cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting Bitcast cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting Bitcast cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting StatelessRandomUniformV2 cause there is no registered converter for this op.
+WARNING:tensorflow:Using a while_loop for converting ImageProjectiveTransformV3 cause there is no registered converter for this op.
+2025-04-02 09:25:00.926897: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
+op: "TensorSliceDataset"
+input: "Placeholder/_0"
+attr {
+  key: "Toutput_types"
+  value {
+    list {
+      type: DT_STRING
+    }
+  }
+}
+attr {
+  key: "_cardinality"
+  value {
+    i: 25511
+  }
+}
+attr {
+  key: "is_files"
+  value {
+    b: false
+  }
+}
+attr {
+  key: "metadata"
+  value {
+    s: "\n\024TensorSliceDataset:0"
+  }
+}
+attr {
+  key: "output_shapes"
+  value {
+    list {
+      shape {
+      }
+    }
+  }
+}
+attr {
+  key: "replicate_on_split"
+  value {
+    b: false
+  }
+}
+experimental_type {
+  type_id: TFT_PRODUCT
+  args {
+    type_id: TFT_DATASET
+    args {
+      type_id: TFT_PRODUCT
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_STRING
+        }
+      }
+    }
+  }
+}
+
+Number of GPUs: 2
+Found 25511 files belonging to 151 classes.
+Epoch 1/10
diff --git a/slurm/train_EfficientNetV2M.sh b/slurm/train_EfficientNetV2M.sh
new file mode 100644
index 0000000000000000000000000000000000000000..207757225027be1444ee7604e80cbaedfc038082
--- /dev/null
+++ b/slurm/train_EfficientNetV2M.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+#SBATCH --job-name=keras_cnn_efficientnetv2m
+#SBATCH --output=EfficientNetV2M_%j.out
+#SBATCH --partition=shared-gpu
+#SBATCH --gres=gpu:2,VramPerGpu:80G
+#SBATCH --cpus-per-task=2
+#SBATCH --mem=16G
+#SBATCH --time=01:00:00
+#SBATCH --mail-type=FAIL
+
+# Load modules
+module purge
+module load GCC/11.3.0 OpenMPI/4.1.4 TensorFlow/2.11.0-CUDA-11.7.0
+module load cuDNN/8.4.1.50-CUDA-11.7.0
+
+# Run your script
+srun python ../Python/pokedex_EfficientNetV2M.py
diff --git a/slurm/train_ResNet50.sh b/slurm/train_ResNet50.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f545644d058abf1d2738d181933c7703c73c06ea
--- /dev/null
+++ b/slurm/train_ResNet50.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+#SBATCH --job-name=keras_cnn_resnet50
+#SBATCH --output=ResNet50_%j.out
+#SBATCH --partition=shared-gpu
+#SBATCH --gres=gpu:2,VramPerGpu:40G
+#SBATCH --cpus-per-task=2
+#SBATCH --mem=16G
+#SBATCH --time=01:00:00
+#SBATCH --mail-type=FAIL
+
+# Load modules
+module purge
+module load GCC/11.3.0 OpenMPI/4.1.4 TensorFlow/2.11.0-CUDA-11.7.0
+module load cuDNN/8.4.1.50-CUDA-11.7.0
+
+# Run your script
+srun python ../Python/pokedex_ResNet50.py
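One line in the EfficientNetV2M job log above is worth a note: under MirroredStrategy, tf.data's AUTO shard policy falls back from FILE to DATA sharding because the dataset built by image_dataset_from_directory is backed by an unshardable TensorSliceDataset. The fallback is harmless, but the warning can be avoided by requesting DATA sharding explicitly. A small sketch under that assumption; the placeholder dataset below stands in for the train/validation datasets built in the training scripts:

    import tensorflow as tf

    # Request DATA sharding up front so the AUTO policy does not have to fall
    # back from FILE sharding and log a warning; elements, rather than whole
    # input files, are then partitioned across workers.
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = (
        tf.data.experimental.AutoShardPolicy.DATA
    )

    # Placeholder dataset; in the training scripts this would be applied to
    # the datasets returned by keras.utils.image_dataset_from_directory.
    ds = tf.data.Dataset.from_tensor_slices(tf.zeros([64, 4])).batch(32)
    ds = ds.with_options(options)

Either way, with the two A100s requested by these SLURM scripts in a single MirroredStrategy worker, each replica still processes half of every 32-image global batch per step.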