Commit 787f30a0 authored by lucas.landrecy

good code

parent 94879607
@@ -4,17 +4,15 @@ WORKDIR /app
# Install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY clean.py .
COPY neo4j_import.py .
# Download and extract the dataset file
RUN apt-get update && apt-get install -y wget tar && \
    wget -O dblp.tar.gz https://originalfileserver.aminer.cn/misc/dblp_v14.tar.gz && \
    mkdir /app/dblp_data && \
    tar -xzf dblp.tar.gz -C /app/dblp_data && \
    rm dblp.tar.gz && \
    apt-get remove -y wget && apt-get clean && rm -rf /var/lib/apt/lists/*
    apt-get install -y libjansson-dev && \
    mkdir /app/dblp_data
# Copy the Python script
COPY insert_script.py .
RUN pip install --no-cache-dir -r requirements.txt
CMD ["python", "insert_script.py"]
CMD sh -c "wget -O dblp.tar.gz https://originalfileserver.aminer.cn/misc/dblp_v14.tar.gz && tar -xzf dblp.tar.gz && python3 insert_script.py "
\ No newline at end of file
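The dependency install relies on requirements.txt, which this diff does not show. Judging from the imports in the scripts below, it presumably lists at least the following packages; this is an inference from the code, not the committed file:

ijson
py2neo
neo4j
tqdm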
# cleaner.py
import re
import json
import ijson
__all__ = ["nettoyer_gros_json"]
def pre_clean_json(input_path, output_path):
    """
    Cleans MongoDB-style NumberInt(...) annotations, replacing them with valid integers.
    """
    print(f"🧹 Cleaning file: {input_path}")
    pattern = re.compile(r'NumberInt\((\d+)\)')
    with open(input_path, 'r', encoding='utf-8') as fin, open(output_path, 'w', encoding='utf-8') as fout:
@@ -13,12 +21,23 @@ def pre_clean_json(input_path, output_path):
    print(f"✅ Cleaned file saved to: {output_path}")
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--infile", type=str, required=True, help="Input JSON file (raw)")
    parser.add_argument("--outfile", type=str, default="clean_dblpv13.json", help="Cleaned output file")
def nettoyer_gros_json(input_path, output_path):
    cleaned_path = "cleaned_" + input_path
    # Run the pre-clean step first; if the input needs no cleaning,
    # point cleaned_path at input_path and skip this call
    pre_clean_json(input_path, cleaned_path)
    args = parser.parse_args()
    pre_clean_json(args.infile, args.outfile)
    print("Begin cleaning")
    with open(cleaned_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.write('[\n')
        parser = ijson.items(infile, 'item')
        first = True
        for obj in parser:
            filtered = {k: obj[k] for k in ['_id', 'title', 'authors', 'references'] if k in obj}
            if not first:
                outfile.write(',\n')
            json.dump(filtered, outfile, ensure_ascii=False)
            first = False
        outfile.write('\n]')
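A minimal usage sketch of the cleaner above, assuming the module is importable as clean (the Dockerfile copies clean.py); the file names are illustrative. The pre-clean pass rewrites MongoDB-style literals such as NumberInt(42) to a plain 42, after which ijson streams the array and only the _id, title, authors and references fields are kept:

from clean import nettoyer_gros_json

# Writes cleaned_dblpv13.json as an intermediate file, then streams the
# filtered records into a JSON array in clear_dblpv13.json
nettoyer_gros_json("dblpv13.json", "clear_dblpv13.json")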
# neo4j_import.py
import os
import ijson
import argparse
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from py2neo import Graph
from tqdm import tqdm
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
BATCH_SIZE = 500
MAX_WORKERS = 2
def batch_insert_articles(graph, articles):
    query = """
    UNWIND $articles AS article
    MERGE (a:ARTICLE { _id: article._id })
    SET a.title = article.title
    FOREACH (author IN article.authors |
        FOREACH (_ IN CASE WHEN author.name IS NOT NULL THEN [1] ELSE [] END |
            MERGE (p:AUTHOR { _id: coalesce(author._id, author.name) })
            SET p.name = author.name
            MERGE (p)-[:AUTHORED]->(a)
        )
    )
    FOREACH (ref_id IN article.references |
        MERGE (r:ARTICLE { _id: ref_id })
        MERGE (a)-[:CITES]->(r)
    )
    """
    graph.run(query, articles=articles)
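# A hedged sketch (assumption: Neo4j 5 Cypher syntax; not part of the
# committed script): MERGE on _id falls back to a label scan unless the
# property is backed by an index, so creating uniqueness constraints once
# before the import makes large MERGE-heavy loads considerably faster.
def ensure_constraints(graph):
    graph.run("CREATE CONSTRAINT article_id IF NOT EXISTS "
              "FOR (a:ARTICLE) REQUIRE a._id IS UNIQUE")
    graph.run("CREATE CONSTRAINT author_id IF NOT EXISTS "
              "FOR (p:AUTHOR) REQUIRE p._id IS UNIQUE")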
def main(json_file, limit):
    start_time = time.time()
    print(f"⏱️ Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    print(f"📄 Optimized read of {json_file} (limit: {limit})")
    with open(json_file, 'r', encoding='utf-8') as f:
        article_iter = ijson.items(f, 'item')
        total = 0
        futures = []

        def flush_batch(batch):
            if batch:
                futures.append(executor.submit(batch_insert_articles, graph, list(batch)))
                batch.clear()

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            batch = []
            for article in tqdm(article_iter):
                if limit and total >= limit:
                    break
                batch.append(article)
                total += 1
                if len(batch) >= BATCH_SIZE:
                    flush_batch(batch)
                    time.sleep(0.1)
            # send the remaining articles
            flush_batch(batch)
            # wait for all threads to finish
            for future in tqdm(as_completed(futures), total=len(futures), desc="💾 Finalizing inserts"):
                future.result()
    end_time = time.time()
    elapsed_ms = int((end_time - start_time) * 1000)
    print("✅ Import finished")
    print(f"⏱️ End: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"🕓 Total duration: {elapsed_ms:,} ms")
    print(f"⚡ Average speed: {int(total / (elapsed_ms / 1000))} it/s")

# --- CLI ---
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", type=str, default="clean_dblpv13.json", help="Path to the cleaned JSON file")
    parser.add_argument("--limit", type=int, default=1000000, help="Maximum number of articles to load")
    args = parser.parse_args()
    main(args.file, args.limit)
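The script takes its connection settings from the environment (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, which docker-compose sets for the client container below); run outside Compose they are unset, os.getenv returns None, and the Graph connection fails. A minimal fallback sketch, assuming a local Neo4j with the same credentials the compose file configures:

import os
from py2neo import Graph

# Fall back to the docker-compose values when the variables are unset
# (assumption: a local Neo4j listening on the default bolt port)
uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
user = os.getenv("NEO4J_USER", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "12345678")
graph = Graph(uri, auth=(user, password))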
# insert_script.py
import os
import ijson
import argparse
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from neo4j import GraphDatabase
from py2neo import Graph
from tqdm import tqdm
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "12345678"
BATCH_SIZE = 10000
MAX_WORKERS = 4
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
BATCH_SIZE = 500
def batch_insert_articles(tx, articles):
def batch_insert_articles(graph, articles):
    query = """
    UNWIND $articles AS article
    MERGE (a:ARTICLE { _id: article._id })
@@ -31,45 +30,40 @@ def batch_insert_articles(tx, articles):
        MERGE (a)-[:CITES]->(r)
    )
    """
    tx.run(query, articles=articles)
    graph.run(query, articles=articles)
def main(json_file, limit):
    clear_file = "clear_" + json_file
    nettoyer_gros_json(json_file, clear_file)
    json_file = clear_file
    start_time = time.time()
    print(f"⏱️ Start: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    print(f"📄 Optimized read of {json_file} (limit: {limit})")
    with open(json_file, 'r', encoding='utf-8') as f:
        article_iter = ijson.items(f, 'item')
        total = 0
        futures = []

        def flush_batch(batch):
            if batch:
                futures.append(executor.submit(session.execute_write, batch_insert_articles, batch))
                # print(f"📤 Batch of {len(batch)} articles sent")
                batch.clear()  # explicitly clear the batch to free memory

        with driver.session() as session, ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            batch = []
            for article in tqdm(article_iter):
                if limit and total >= limit:
                    break
                batch.append(article)
                total += 1
                if len(batch) >= BATCH_SIZE:
                    flush_batch(batch)
            # send the remaining articles
            flush_batch(batch)
                    batch_insert_articles(graph, batch)
                    batch.clear()
            # wait for all threads to finish
            for future in tqdm(as_completed(futures), total=len(futures), desc="💾 Finalizing inserts"):
                future.result()
            # insert the remaining items
            if batch:
                batch_insert_articles(graph, batch)
    driver.close()
    end_time = time.time()
    elapsed_ms = int((end_time - start_time) * 1000)
@@ -78,11 +72,18 @@ def main(json_file, limit):
    print(f"🕓 Total duration: {elapsed_ms:,} ms")
    print(f"⚡ Average speed: {int(total / (elapsed_ms / 1000))} it/s")
    # Wait for user input before closing; when stdin is not a TTY (as in a
    # detached container) input() raises EOFError, so exit cleanly instead
    try:
        while input("Type 'exit' to quit or press Enter to continue... ") != "exit":
            pass
    except EOFError:
        pass
# --- CLI ---
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", type=str, default="clean_dblpv13.json", help="Path to the cleaned JSON file")
    parser.add_argument("--limit", type=int, default=1000000, help="Maximum number of articles to load")
    parser.add_argument("--file", type=str, default="dblpv13.json", help="Path to the raw JSON file (the script cleans it)")
    parser.add_argument("--limit", type=int, default=5000000, help="Maximum number of articles to load")
    args = parser.parse_args()
    main(args.file, args.limit)
\ No newline at end of file
File moved
@@ -9,11 +9,15 @@ services:
      - "7474:7474"
      - "7687:7687"
    environment:
      - NEO4J_AUTH=neo4j/testtest
      - NEO4J_AUTH=neo4j/12345678
    volumes:
      - ${HOME}/neo4j/logs:/logs
      - ${HOME}/neo4j/data:/data
      - ${HOME}/neo4j/import:/var/lib/neo4j/import
    deploy:
      resources:
        limits:
          memory: 3g
  neo4j_client:
    build:
@@ -25,4 +29,10 @@ services:
    environment:
      - NEO4J_URI=bolt://neo4j_db:7687
      - NEO4J_USER=neo4j
      - NEO4J_PASSWORD=testtest
      - NEO4J_PASSWORD=12345678
    volumes:
      - ${HOME}/neo4j/import:${HOME}/import
    deploy:
      resources:
        limits:
          memory: 3g
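Before launching the full import it can help to smoke-test the connection from the client container. A minimal sketch, assuming the same environment variables the compose file sets:

import os
from py2neo import Graph

# RETURN 1 round-trips through bolt, so any auth or networking problem
# surfaces here instead of mid-import
graph = Graph(os.getenv("NEO4J_URI"), auth=(os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD")))
print(graph.run("RETURN 1 AS ok").data())  # expect [{'ok': 1}]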
@@ -3,14 +3,6 @@ FROM neo4j:latest
USER root
RUN apt-get update && \
    apt-get install -y python3 python3-pip && \
    pip3 install neo4j tqdm ijson && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
RUN apt-get update
USER neo4j
RUN cd ~ && \
    git clone https://gitedu.hesge.ch/lucas.landrecy/advdaba_labo2.git