diff --git a/clear_json.py b/clear_json.py
deleted file mode 100644
index f3a1e83224e7937854ec1a0d1540a724623f4abf..0000000000000000000000000000000000000000
--- a/clear_json.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import re
-
-def pre_clean_json(input_path, output_path):
-    """
-    Nettoie les annotations MongoDB type NumberInt(...) et les remplace par des entiers valides.
-    """
-    pattern = re.compile(r'NumberInt\((\d+)\)')
-
-    with open(input_path, 'r', encoding='utf-8') as fin, open(output_path, 'w', encoding='utf-8') as fout:
-        for line in fin:
-            cleaned_line = pattern.sub(r'\1', line)
-            fout.write(cleaned_line)
-
-    print(f"✅ Fichier nettoyé sauvegardé dans : {output_path}")
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--infile", type=str, required=True, help="Fichier JSON d'entrée (brut)")
-    parser.add_argument("--outfile", type=str, default="clean_dblpv13.json", help="Fichier nettoyé de sortie")
-
-    args = parser.parse_args()
-    pre_clean_json(args.infile, args.outfile)
diff --git a/client/Dockerfile b/client/Dockerfile
index fb5a3250a59193554f1f4bb61557e755172c3179..37852632c525ca35f57a87c0271ce62a58731b87 100644
--- a/client/Dockerfile
+++ b/client/Dockerfile
@@ -4,17 +4,15 @@
 WORKDIR /app
 
 # Installer les dépendances
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+COPY clean.py .
+COPY neo4j_import.py .
+
 # Télécharger et extraire le fichier
 RUN apt-get update && apt-get install -y wget tar && \
-    wget -O dblp.tar.gz https://originalfileserver.aminer.cn/misc/dblp_v14.tar.gz && \
-    mkdir /app/dblp_data && \
-    tar -xzf dblp.tar.gz -C /app/dblp_data && \
-    rm dblp.tar.gz && \
-    apt-get remove -y wget && apt-get clean && rm -rf /var/lib/apt/lists/*
+    apt install libjansson-dev -y && \
+    mkdir /app/dblp_data
 
-# Copier le script Python
-COPY insert_script.py .
+RUN pip install --no-cache-dir -r requirements.txt
 
-CMD ["python", "insert_script.py"]
+CMD sh -c "wget -O dblp.tar.gz https://originalfileserver.aminer.cn/misc/dblp_v14.tar.gz && tar -xzf dblp.tar.gz && python3 neo4j_import.py"
\ No newline at end of file
diff --git a/client/clean.py b/client/clean.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad77a50b81691ac161d38a40b5b1dbdf0ae98480
--- /dev/null
+++ b/client/clean.py
@@ -0,0 +1,43 @@
+# clean.py
+
+import re
+import json
+import ijson
+import io
+
+__all__ = ["nettoyer_gros_json"]
+
+def pre_clean_json(input_path, output_path):
+    """
+    Nettoie les annotations MongoDB type NumberInt(...) et les remplace par des entiers valides.
+    """
+    print(f"🧹 Nettoyage du fichier : {input_path}")
+    pattern = re.compile(r'NumberInt\((\d+)\)')
+
+    with open(input_path, 'r', encoding='utf-8') as fin, open(output_path, 'w', encoding='utf-8') as fout:
+        for line in fin:
+            cleaned_line = pattern.sub(r'\1', line)
+            fout.write(cleaned_line)
+
+    print(f"✅ Fichier nettoyé sauvegardé dans : {output_path}")
+
+
+def nettoyer_gros_json(input_path, output_path):
+    cleaned_path = "cleaned_" + input_path
+
+    # Pre-clean the raw dump so ijson receives valid JSON:
+    # NumberInt(...) annotations are replaced with plain integers
+    pre_clean_json(input_path, cleaned_path)
+
+    print("Begin clear")
+    with open(cleaned_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
+        outfile.write('[\n')
+        parser = ijson.items(infile, 'item')
+        first = True
+        for obj in parser:
+            filtered = {k: obj[k] for k in ['_id', 'title', 'authors', 'references'] if k in obj}
+            if not first:
+                outfile.write(',\n')
+            json.dump(filtered, outfile, ensure_ascii=False)
+            first = False
+        outfile.write('\n]')
diff --git a/client/insert_script.py b/client/insert_script.py
deleted file mode 100644
index bbc630bd4b6406067cf02aa294e31e3042cd61c4..0000000000000000000000000000000000000000
--- a/client/insert_script.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import ijson
-import argparse
-import time
-from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from py2neo import Graph
-from tqdm import tqdm
-
-NEO4J_URI = os.getenv("NEO4J_URI")
-NEO4J_USER = os.getenv("NEO4J_USER")
-NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
-BATCH_SIZE = 500
-MAX_WORKERS = 2
-
-def batch_insert_articles(graph, articles):
-    query = """
-    UNWIND $articles AS article
-    MERGE (a:ARTICLE { _id: article._id })
-    SET a.title = article.title
-
-    FOREACH (author IN article.authors |
-        FOREACH (_ IN CASE WHEN author.name IS NOT NULL THEN [1] ELSE [] END |
-            MERGE (p:AUTHOR { _id: coalesce(author._id, author.name) })
-            SET p.name = author.name
-            MERGE (p)-[:AUTHORED]->(a)
-        )
-    )
-
-    FOREACH (ref_id IN article.references |
-        MERGE (r:ARTICLE { _id: ref_id })
-        MERGE (a)-[:CITES]->(r)
-    )
-    """
-    graph.run(query, articles=articles)
-
-def main(json_file, limit):
-    start_time = time.time()
-    print(f"⏱️ Début : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-
-    graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
-
-    print(f"📄 Lecture optimisée de {json_file} (limite: {limit})")
-
-    with open(json_file, 'r', encoding='utf-8') as f:
-        article_iter = ijson.items(f, 'item')
-        total = 0
-        futures = []
-
-        def flush_batch(batch):
-            if batch:
-                futures.append(executor.submit(batch_insert_articles, graph, list(batch)))
-                batch.clear()
-
-        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-            batch = []
-            for article in tqdm(article_iter):
-                if limit and total >= limit:
-                    break
-                batch.append(article)
-                total += 1
-
-                if len(batch) >= BATCH_SIZE:
-                    flush_batch(batch)
-                    time.sleep(0.1)
-
-            # envoyer les derniers articles
-            flush_batch(batch)
-
-            # attendre la fin de tous les threads
-            for future in tqdm(as_completed(futures), total=len(futures), desc="💾 Finalisation des insertions"):
-                future.result()
-
-    end_time = time.time()
-    elapsed_ms = int((end_time - start_time) * 1000)
-
-    print(f"✅ Import terminé")
-    print(f"⏱️ Fin : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"🕓 Durée totale : {elapsed_ms:,} ms")
-    print(f"⚡ Vitesse moyenne : {int(total / (elapsed_ms / 1000))} it/s")
-
-# --- CLI ---
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--file", type=str, default="clean_dblpv13.json", help="Chemin vers le fichier JSON nettoyé")
-    parser.add_argument("--limit", type=int, default=1000000, help="Nombre maximum d'articles à charger")
-    args = parser.parse_args()
-
-    main(args.file, args.limit)
-
diff --git a/client/neo4j_import.py b/client/neo4j_import.py
index 0adeaea5f12230a15699b9372836c7a6b3ef4490..e9523ae35527bb2b31b650313c952ade60509b19 100644
--- a/client/neo4j_import.py
+++ b/client/neo4j_import.py
@@ -1,18 +1,18 @@
+import os
 import ijson
 import argparse
 import time
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from neo4j import GraphDatabase
+from py2neo import Graph
+from clean import nettoyer_gros_json
 from tqdm import tqdm
 
-NEO4J_URI = "bolt://localhost:7687"
-NEO4J_USER = "neo4j"
-NEO4J_PASSWORD = "12345678"
-BATCH_SIZE = 10000
-MAX_WORKERS = 4
+NEO4J_URI = os.getenv("NEO4J_URI")
+NEO4J_USER = os.getenv("NEO4J_USER")
+NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
+BATCH_SIZE = 500
 
-def batch_insert_articles(tx, articles):
+def batch_insert_articles(graph, articles):
     query = """
     UNWIND $articles AS article
     MERGE (a:ARTICLE { _id: article._id })
@@ -31,45 +31,40 @@ def batch_insert_articles(tx, articles):
         MERGE (a)-[:CITES]->(r)
     )
     """
-    tx.run(query, articles=articles)
+    graph.run(query, articles=articles)
 
 def main(json_file, limit):
+    clear_file = "clear_"+json_file
+    nettoyer_gros_json(json_file, clear_file)
+    json_file = clear_file
+
     start_time = time.time()
     print(f"⏱️ Début : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
-    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
+    graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
+
     print(f"📄 Lecture optimisée de {json_file} (limite: {limit})")
 
     with open(json_file, 'r', encoding='utf-8') as f:
         article_iter = ijson.items(f, 'item')
         total = 0
-        futures = []
-
-        def flush_batch(batch):
-            if batch:
-                futures.append(executor.submit(session.execute_write, batch_insert_articles, batch))
-                #print(f"📤 Batch de {len(batch)} articles envoyé")
-                batch.clear()  # Vider explicitement le batch pour libérer de la mémoire
+        batch = []
 
-        with driver.session() as session, ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-            batch = []
-            for article in tqdm(article_iter):
-                if limit and total >= limit:
-                    break
-                batch.append(article)
-                total += 1
+        for article in tqdm(article_iter):
+            if limit and total >= limit:
+                break
 
-                if len(batch) >= BATCH_SIZE:
-                    flush_batch(batch)
+            batch.append(article)
+            total += 1
 
-            # envoyer les derniers articles
-            flush_batch(batch)
+            if len(batch) >= BATCH_SIZE:
+                batch_insert_articles(graph, batch)
+                batch.clear()
 
-            # attendre la fin de tous les threads
-            for future in tqdm(as_completed(futures), total=len(futures), desc="💾 Finalisation des insertions"):
-                future.result()
+        # insérer les derniers éléments
+        if batch:
+            batch_insert_articles(graph, batch)
 
-    driver.close()
 
     end_time = time.time()
     elapsed_ms = int((end_time - start_time) * 1000)
@@ -78,11 +73,18 @@
     print(f"🕓 Durée totale : {elapsed_ms:,} ms")
     print(f"⚡ Vitesse moyenne : {int(total / (elapsed_ms / 1000))} it/s")
 
+    # Wait for user input before closing
+    test = ""
+    while test != "exit":
+        test = input("Tapez 'exit' pour quitter ou appuyez sur Entrée pour continuer...")
+
+
+
 # --- CLI ---
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--file", type=str, default="clean_dblpv13.json", help="Chemin vers le fichier JSON nettoyé")
default="clean_dblpv13.json", help="Chemin vers le fichier JSON nettoyé") - parser.add_argument("--limit", type=int, default=1000000, help="Nombre maximum d'articles à charger") + parser.add_argument("--file", type=str, default="dblpv13.json", help="Chemin vers le fichier JSON nettoyé") + parser.add_argument("--limit", type=int, default=5000000, help="Nombre maximum d'articles à charger") args = parser.parse_args() main(args.file, args.limit) \ No newline at end of file diff --git a/client/requirement.txt b/client/requirements.txt similarity index 100% rename from client/requirement.txt rename to client/requirements.txt diff --git a/docker-compose.yml b/docker-compose.yml index 66243f35c1a1d98758e75f1c6e13ccede8677ee2..d6484fe1cfb95f061708f480d91b34eb4d957ddf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,11 +9,15 @@ services: - "7474:7474" - "7687:7687" environment: - - NEO4J_AUTH=neo4j/testtest + - NEO4J_AUTH=neo4j/12345678 volumes: - ${HOME}/neo4j/logs:/logs - ${HOME}/neo4j/data:/data - - ${HOME}/neo4j/import:/var/lib/neo4j/import + deploy: + resources: + limits: + memory: 3g + neo4j_client: build: @@ -25,4 +29,10 @@ services: environment: - NEO4J_URI=bolt://neo4j_db:7687 - NEO4J_USER=neo4j - - NEO4J_PASSWORD=testtest + - NEO4J_PASSWORD=12345678 + volumes: + - ${HOME}/neo4j/import:${HOME}/import + deploy: + resources: + limits: + memory: 3g diff --git a/neo4j/Dockerfile b/neo4j/Dockerfile index 30e060b151a5763351cd5edb0ae15f92bb7c0243..38a24b8aedea38efe21e00544646e4153835a022 100644 --- a/neo4j/Dockerfile +++ b/neo4j/Dockerfile @@ -3,14 +3,6 @@ FROM neo4j:latest USER root -RUN apt-get update && \ - apt-get install -y python3 python3-pip && \ - pip3 install neo4j tqdm ijson && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* +RUN apt-get update USER neo4j - -RUN cd ~ && \ - git clone https://gitedu.hesge.ch/lucas.landrecy/advdaba_labo2.git -