Commit f4f85471 authored by Ivan Pavlovich

Modified the script that retrieves data from the PubMed API, and the script that stores PubMed articles in bulk locally
parent 88696137
File added
[]
\ No newline at end of file
@@ -7,6 +7,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from requests import get
 from parsers.xmlParser import parseXmlFile
 import json
+from variables.pubmed import PUBMED_API_KEY
+import xmltodict
 
 TMP_DIR_NAME = "./tmp"
 TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), TMP_DIR_NAME))
@@ -28,9 +30,14 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
         print(f"Date min: {date_min}")
         print(f"Date max: {date_max}")
 
-    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
+    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&api_key={PUBMED_API_KEY}&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
 
+    while(True):
+        try:
+            response = get(url)
+            break
+        except Exception as e:
+            print(e)
 
     search_res = response.json()
@@ -42,35 +49,81 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
     url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}"
 
+    while(True):
+        try:
+            response = get(url)
+            break
+        except Exception as e:
+            print(e)
+
     with open(f"{TMP_DIR}/{TMP_FILENAME}", "w+", encoding="utf-8") as file:
         file.write(response.text)
 
-    obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}")
+    # obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}")
+    obj = xmltodict.parse(response.text)
+    obj = obj["PubmedArticleSet"]
 
     print()
 
     data_list = []
 
-    for key in obj.keys():
+    for key in obj:
         if isinstance(obj[key], list):
             i = 0
             for entrie in obj[key]:
                 if "MedlineCitation" in entrie:
                     print("---------------------------------------------------------")
                     if "MeshHeadingList" in entrie["MedlineCitation"]:
                         data = {}
-                        data["PMID"] = entrie["MedlineCitation"]["PMID"]
+                        data["PMID"] = entrie["MedlineCitation"]["PMID"]["#text"]
+
+                        data["Title"] = ""
+                        if isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], list):
+                            for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]:
+                                if "#text" in part:
+                                    data["Title"] += part["#text"]
+                        else:
+                            data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]
+
+                        data["ArticleTitle"] = ""
+                        if isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], list):
+                            for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]:
+                                if "#text" in part:
+                                    data["ArticleTitle"] += part["#text"]
+                        else:
+                            data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]
+
+                        data["Abstract"] = ""
+                        if "Abstract" in entrie["MedlineCitation"]["Article"]:
+                            if isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], list):
+                                for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]:
+                                    if "#text" in part:
+                                        data["Abstract"] += part["#text"]
+                            else:
+                                data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
+
                         data["Predictions"] = []
                         data["MeshTerms"] = []
 
                         if isinstance(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"], list):
                             for meshTerm in entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]:
-                                data["MeshTerms"].append(meshTerm["DescriptorName"])
+                                data["MeshTerms"].append(meshTerm["DescriptorName"]["#text"])
+                        else:
+                            data["MeshTerms"].append(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]["DescriptorName"]["#text"])
 
                         for date in entrie["PubmedData"]["History"]["PubMedPubDate"]:
                             if date["@PubStatus"] == "pubmed":
                                 data["Date"] = {
                                     "Year": date["Year"],
                                     "Month": date["Month"],
                                     "Day": date["Day"]
                                 }
                                 break
 
                         print(data)
 
                         if debug:
                             print(f"Index: {obj[key].index(entrie)}")
......
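Editor's note on the two changes above. First, the added api_key parameter raises NCBI's E-utilities rate limit from 3 to 10 requests per second per key. Second, switching from the homegrown parseXmlFile to xmltodict is what makes the new ["#text"] lookups necessary: when an element carries XML attributes (PMID's Version, for instance), xmltodict nests the attributes under "@"-prefixed keys and the element's character data under "#text". A minimal sketch of that behaviour (the sample XML is illustrative, not taken from the commit):

import xmltodict

# An element with an attribute is parsed into a dict: the attribute gets an
# "@"-prefixed key and the text content lands under "#text".
doc = xmltodict.parse('<PMID Version="1">12345678</PMID>')
print(doc["PMID"])           # {'@Version': '1', '#text': '12345678'}
print(doc["PMID"]["#text"])  # '12345678'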
import sys
import os
from datetime import datetime, timedelta
import time
import json

# Add the parent directory to the module search path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))

from dataSources.PubMed.pubmedApi import getPubmedData
from variables.pubmed import *
from dataSources.PubMed.util import *

DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data"))

ncds_mesh_noexp = get_mesh_noexp_term(NCDS_MESH_TERM)
search_term = url_encode(" OR ".join(ncds_mesh_noexp))

data = []

with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

current_date = datetime(2022, 1, 1)

while current_date < datetime(2024, 12, 31):
    next_date = current_date + timedelta(weeks=1)
    data += getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), next_date.strftime("%Y/%m/%d"))
    current_date = next_date
    time.sleep(0.1)

with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file:
    json.dump(data, json_file, indent=4)
\ No newline at end of file
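Editor's note, hedged since the commit does not address it: E-utilities mindate/maxdate ranges are inclusive, so consecutive one-week windows share a boundary day and can return the same article twice (the time.sleep(0.1) pacing, for its part, matches the 10 requests/second allowed with an API key). A small post-processing pass before the final dump would guard against boundary duplicates:

# Hypothetical dedup step, not part of the commit: drop articles whose PMID
# was already collected in an earlier (overlapping) window.
seen = set()
deduped = []
for article in data:
    if article["PMID"] not in seen:
        seen.add(article["PMID"])
        deduped.append(article)
data = deduped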
from transformers import AutoTokenizer
# Choose a tokenizer (e.g., GPT-2, BERT, T5, etc.)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Your text
text = "Hugging Face makes NLP easy!"
# Tokenize and count tokens
tokens = tokenizer(text, return_tensors="pt") # You can also use return_tensors="tf" or "np"
num_tokens = len(tokens["input_ids"][0])
print(f"Number of tokens: {num_tokens}")
\ No newline at end of file
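Editor's note on the count above: for BERT-style tokenizers it includes the [CLS] and [SEP] special tokens that the tokenizer adds around the text. A hedged variant counting only the text's own tokens:

# Without return_tensors the tokenizer returns plain Python lists, and
# add_special_tokens=False excludes [CLS]/[SEP] from the count.
tokens_no_special = tokenizer(text, add_special_tokens=False)
print(f"Number of tokens without specials: {len(tokens_no_special['input_ids'])}")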
File added
@@ -3,10 +3,12 @@ import xml.etree.ElementTree as ET
 def xml_to_obj(xml_element):
     res = {}
 
     if xml_element.attrib:
         res["@attributes"] = xml_element.attrib
 
+    text = xml_element.text.strip() if xml_element.text and xml_element.text.strip() else None
+
     for child in xml_element:
         if child.text:
             res[child.tag] = child.text
         else:
             child_dict = xml_to_obj(child)
             if child.tag in res:
@@ -17,10 +19,12 @@ def xml_to_obj(xml_element):
             else:
                 res[child.tag] = child_dict
 
+    if text and not res:
+        return text
+
     return res
 
 def parseXmlFile(filename):
     tree = ET.parse(filename)
     root = tree.getroot()
     return xml_to_obj(root)
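Editor's note on the effect of the two added lines, illustrated on a toy element (the sample XML is illustrative, not from the repository): a leaf element whose only content is text now comes back as that string instead of an empty dict.

import xml.etree.ElementTree as ET

# Before this commit xml_to_obj returned {} for a text-only leaf; the new
# `text` capture plus the `if text and not res` guard returns the string.
leaf = ET.fromstring("<Title>Noncommunicable diseases</Title>")
print(xml_to_obj(leaf))  # -> 'Noncommunicable diseases'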