Commit 94879607 authored by lucas.landrecy

modification

parent 1760db78
FROM python:3.10-slim
WORKDIR /app
# Install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Download and extract the DBLP dump
RUN apt-get update && apt-get install -y wget tar && \
wget -O dblp.tar.gz https://originalfileserver.aminer.cn/misc/dblp_v14.tar.gz && \
mkdir /app/dblp_data && \
tar -xzf dblp.tar.gz -C /app/dblp_data && \
rm dblp.tar.gz && \
apt-get remove -y wget && apt-get clean && rm -rf /var/lib/apt/lists/*
# Copy the Python script
COPY insert_script.py .
CMD ["python", "insert_script.py"]
import os
import ijson
import argparse
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from neo4j import GraphDatabase
from py2neo import Graph
from tqdm import tqdm
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "testtest"
BATCH_SIZE = 1000
MAX_WORKERS = 4
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
BATCH_SIZE = 500
MAX_WORKERS = 2
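Note that os.getenv returns None when a variable is unset, and a None URI makes the connection fail outright. A minimal sketch, assuming the removed hard-coded values are still acceptable local fallbacks:
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")   # fallback for runs outside docker-compose (assumption)
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "testtest")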
def batch_insert_articles(tx, articles):
def batch_insert_articles(graph, articles):
query = """
UNWIND $articles AS article
MERGE (a:ARTICLE { _id: article._id })
@@ -31,29 +32,28 @@ def batch_insert_articles(tx, articles):
MERGE (a)-[:CITES]->(r)
)
"""
tx.run(query, articles=articles)
graph.run(query, articles=articles)
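For reference, a minimal self-contained sketch of the UNWIND + MERGE batching pattern used here with py2neo; the article fields and sample values are illustrative assumptions, not the project's full query:

from py2neo import Graph

# Illustrative only: URI and credentials match the rest of the repo, the fields are assumed.
graph = Graph("bolt://localhost:7687", auth=("neo4j", "testtest"))
articles = [
    {"_id": "a1", "title": "First paper"},
    {"_id": "a2", "title": "Second paper"},
]
query = """
UNWIND $articles AS article
MERGE (a:ARTICLE { _id: article._id })
SET a.title = article.title
"""
graph.run(query, articles=articles)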
def main(json_file, limit):
start_time = time.time()
print(f"⏱️ Début : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
print(f"📄 Lecture optimisée de {json_file} (limite: {limit})")
with open(json_file, 'r', encoding='utf-8') as f:
article_iter = ijson.items(f, 'item')
batch = []
total = 0
futures = []
def flush_batch():
nonlocal batch, futures
def flush_batch(batch):
if batch:
futures.append(executor.submit(session.execute_write, batch_insert_articles, batch))
print(f"📤 Batch de {len(batch)} articles envoyé")
batch = []
futures.append(executor.submit(batch_insert_articles, graph, list(batch)))
batch.clear()
with driver.session() as session, ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
batch = []
for article in tqdm(article_iter):
if limit and total >= limit:
break
@@ -61,16 +61,16 @@ def main(json_file, limit):
total += 1
if len(batch) >= BATCH_SIZE:
flush_batch()
flush_batch(batch)
time.sleep(0.1)
# send the remaining articles
flush_batch()
flush_batch(batch)
# wait for all threads to finish
for future in tqdm(as_completed(futures), total=len(futures), desc="💾 Finalizing insertions"):
future.result()
driver.close()
end_time = time.time()
elapsed_ms = int((end_time - start_time) * 1000)
@@ -87,3 +87,4 @@ if __name__ == "__main__":
args = parser.parse_args()
main(args.file, args.limit)
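The streaming pattern above keeps memory flat by parsing the JSON array incrementally instead of loading the whole DBLP dump; a condensed, self-contained sketch of the same idea follows, where the file path and the insert function are placeholder assumptions:

import ijson
from concurrent.futures import ThreadPoolExecutor

def insert(batch):
    # stand-in for batch_insert_articles(graph, batch)
    print(f"would insert {len(batch)} articles")

with open("dblp_data/dblp.json", encoding="utf-8") as f, \
        ThreadPoolExecutor(max_workers=2) as executor:
    futures, batch = [], []
    for article in ijson.items(f, "item"):   # yields one top-level array element at a time
        batch.append(article)
        if len(batch) >= 500:
            futures.append(executor.submit(insert, list(batch)))
            batch.clear()
    if batch:                                # flush the final partial batch
        futures.append(executor.submit(insert, list(batch)))
    for future in futures:
        future.result()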
File moved
tqdm
ijson
argparse
py2neo
version: '3.8'
services:
advdaba_labo2:
build:
context: .
dockerfile: Dockerfile
image: advdaba_labo2_img # <- custom image
container_name: advdaba_labo2
neo4j_db:
image: neo4j:latest
container_name: neo4j_db
restart: unless-stopped
ports:
- "7474:7474"
@@ -15,7 +14,15 @@ services:
- ${HOME}/neo4j/logs:/logs
- ${HOME}/neo4j/data:/data
- ${HOME}/neo4j/import:/var/lib/neo4j/import
deploy:
resources:
limits:
memory: 3g
neo4j_client:
build:
context: ./client
dockerfile: Dockerfile
container_name: neo4j_client
depends_on:
- neo4j_db
environment:
- NEO4J_URI=bolt://neo4j_db:7687
- NEO4J_USER=neo4j
- NEO4J_PASSWORD=testtest
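With these variables injected by docker-compose, connectivity from the client container can be verified with a short probe; a sketch, assuming py2neo as in the script above:

import os
from py2neo import Graph

# Connectivity probe using the environment injected by docker-compose.
graph = Graph(os.getenv("NEO4J_URI"), auth=(os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD")))
print(graph.run("RETURN 1 AS ok").data())   # expect [{'ok': 1}] when the database is reachable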
File moved