Commit ca661662 authored by Leo Pellandini
embedding

parent 8cb864ab
File added
{
"type": "service_account",
"project_id": "mse-test-project-436514",
"private_key_id": "2645e2680f535ae1246844ac7ecca7e6c1212fd6",
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDP72/azCfcLcj/\nh0IvTILjDUBEmAXTKcfm4q05WSko0ReS4t9qDA8WOpxZSQjOqj9R+QDJ7Z72iLIY\n0Z7ers5x9y6lZ227gP0IIur+gWsV003IDHdQvEK3X1rI4bX5XbUgAYsb0R9am7DN\n0cdiu3RFrf7/MQUbRkq1zR7ahDbgzTt3S6Bl/CifLJ2rstf5g3qMNPcC0KbnUqsc\n2utcJAFSa0EfX2KUeSOYsB3isA5GagEZkXueYqVIIxtegWi2qHWuzigeCIlCgrdj\nZAlxhR6RdsB03MAOsJF6B0Z4SlGZzPHB91KtMnEcvMduvTciSSFKXuM2YwqZg7VX\nTX0hUoOvAgMBAAECggEAFSCsCdxClJGmYahZpjqs9esLw6x6swkEwcX4eeJAV2Bf\nd9JLhxfwlH6iEkRke7M6udaGKP5W74/WIMVz0gaK/XNzLuVCdhHWI/SAUhnOSqps\ntc3mdbKbSMyMieq9Nbg6xiTCALKP8AHvxgnxq2uGlennBgDyFuJehvhvkR5sAQ1K\ngStlVbnejW8ZNRFrjkbaP1G9op2CacLrU/5S+Okr6AFcKFh5QmGiLESMiihJuuGZ\npvfMkNzrrA9K70g94twt06vEU2SiGHdBQ2cGUVZYXcsI+Avbqq+/pfj3WxfwXqqh\nDx/HzhiUmEPjE5exa0ArnwxuAeUBILqhMhTeNpfnWQKBgQDo6UDyu6Xvm9THjb5w\nSAiOCjZaGvCkTQZaedl2JWBtNO7H3W8Vccoll32HpHG7L6mIeLP9I2Lk+AUZOWhU\nlQLHy9ofToAs9ZSZpSyTAg1HKK/REMiU9eOez2yEQ5iWqKYXv79OJpyXM06uSx5/\nyz8T9ZQxz9qFzdMiiPbuWMVIAwKBgQDkjFqfeYsSolLGimuO4Sx6Pre5ObearXgP\noYUNwGODdkg4wm7zpJc2XiDBlL/iyW2Gyt4M2jTmJI+wKOWsGPTPOTMBk7cNLbMx\nDiGPaQXAG1XDtxYj2TKojoRBkbfJX63NI6vkKRL/vzMmbCJ2y1lKX0j65LTrwm8b\nGhIdn9Wz5QKBgQCFYYbjOxkFBe2ttfu4W1gi17BWm5Tx0nZv+9XQNglpoOWZqbLC\nyh5ktsOZmU/UTbA9yjnxHoG09GAfGOQphAhKmPA5+3+lv6Gw94l2SreF58P/6yej\nPslymgDgIcHRjZVIhnOs8qm8YRKO98/oiWF/MaUDfa/77moaHeujhUy9NwKBgQCM\nswNPTioZ7Kh85dZVfbY+A8JjW2724HgbV1psHtakpfrMRpa7k8YriEMuKX8ABPVS\nmC2fR+5tCHEVB/hsvGhp8lK+U8vLZyj7uDFc8lDB9ZIVDO+qXhpbvnEZVLYKWMbM\nlXtK2SaDH5hDvSpya7mqmYJ6QrZGtcpkquYgKrgLKQKBgQDmooLfchORwvl0szmB\nXkpz1B52UT860cIVnfvatm6ImPqwSPGrDKJDgpbeoDaMKf2Z/pmLxWtFIzJQRXew\n53U1d2diEGprBzUhQUBQju1bLcpQkPYyVov7ZYudahOijt8pj35Zz0HsyFkDYQvv\nnRn2cosZM+uzYP9QlVgGIAS2Ig==\n-----END PRIVATE KEY-----\n",
"client_email": "vertexai@mse-test-project-436514.iam.gserviceaccount.com",
"client_id": "103535310171085862136",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/vertexai%40mse-test-project-436514.iam.gserviceaccount.com",
"universe_domain": "googleapis.com"
}
# Creator: Abir Chebbi (abir.chebbi@hesge.ch)
import boto3
import os
import argparse
from google.cloud import storage
from google.cloud import aiplatform
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import BedrockEmbeddings
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from langchain_community.vectorstores import OpenSearchVectorSearch
## S3_client
s3_client = boto3.client('s3')
## Bedrock client
bedrock_client = boto3.client(service_name="bedrock-runtime")
# Google Cloud authentication configuration
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-account-file.json"
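# Note (illustrative, not part of the script's flow): the committed key can also be
# loaded explicitly instead of via the environment variable; the scope below is an
# assumption.
#   from google.oauth2 import service_account
#   creds = service_account.Credentials.from_service_account_file(
#       "service-account-file.json",
#       scopes=["https://www.googleapis.com/auth/cloud-platform"])
#   storage.Client(project=creds.project_id, credentials=creds)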
## Configuration for AWS authentication and OpenSearch client
credentials = boto3.Session().get_credentials()
awsauth = AWSV4SignerAuth(credentials, 'us-east-1', 'aoss')
# Google Cloud Storage client
storage_client = storage.Client()
## Create Index in Opensearch
def create_index(client, index_name):
    indexBody = {
        "settings": {
            "index.knn": True
        },
        "mappings": {
            "properties": {
                "vector_field": {
                    "type": "knn_vector",
                    "dimension": 1536,
                    "method": {
                        "engine": "faiss",
                        "name": "hnsw"
                    }
                }
            }
        }
    }
    try:
        create_response = client.indices.create(index_name, body=indexBody)
        print('\nCreating index:')
        print(create_response)
    except Exception as e:
        print(e)
        print("(Index likely already exists?)")
## Load docs from S3 (S3 variant; main uses the GCS loader below)
def download_documents_from_s3(bucket_name, local_dir):
    response = s3_client.list_objects_v2(Bucket=bucket_name)
    for item in response['Contents']:
        key = item['Key']
        if key.endswith('.pdf'):
            local_filename = os.path.join(local_dir, key)
            s3_client.download_file(Bucket=bucket_name, Key=key, Filename=local_filename)

# Download documents from Google Cloud Storage
def download_documents(bucket_name, local_dir):
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs()
    for blob in blobs:
        if blob.name.endswith('.pdf'):
            local_filename = os.path.join(local_dir, blob.name)
            blob.download_to_filename(local_filename)
            print(f'Downloaded {blob.name} to {local_filename}')
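# Example usage (illustrative): bucket name and target directory are placeholders, and
# the local directory must exist before downloading.
#   os.makedirs('./docs', exist_ok=True)
#   download_documents('my-pdf-bucket', './docs')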
# Split pages/text into chunks
def split_text(docs, chunk_size, chunk_overlap):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(docs)
    return chunks
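# Illustration (not executed): the splitter respects separators, so chunk boundaries are
# approximate. With chunk_size=1000 and chunk_overlap=100, a ~2,400-character document
# typically yields three chunks, each sharing roughly 100 characters with its neighbour.
#   sample_chunks = split_text(docs, 1000, 100)
#   print(len(sample_chunks), len(sample_chunks[0].page_content))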
# Generate embeddings with Vertex AI
def generate_embeddings(texts):
    # Initialise Vertex AI
    aiplatform.init(project="mse-test-project-436514", location="us-central1")
    # Replace with your actual endpoint ID
    endpoint_id = "2223196018688655360"
    # Get the endpoint used to generate the embeddings
    endpoint = aiplatform.Endpoint(endpoint_id)
    # Build the instances with the "inputs" field expected by the deployed model
    instances = [{"inputs": text} for text in texts]
    # Generate the embeddings via Vertex AI
    response = endpoint.predict(instances=instances)
    # Return the embeddings
    return response.predictions

## Generate embeddings with Amazon Bedrock (Bedrock variant; main uses the Vertex AI version above)
def generate_embeddings_bedrock(bedrock_client, chunks):
    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
    chunks_list = [chunk.page_content for chunk in chunks]
    embeddings = embeddings_model.embed_documents(chunks_list)
    return embeddings
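# Example usage (illustrative): the exact shape of response.predictions depends on the
# model deployed behind endpoint 2223196018688655360, which this commit does not show.
#   vectors = generate_embeddings([chunk.page_content for chunk in chunks])
#   print(len(vectors), len(vectors[0]))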
# Store generated embeddings into an OpenSearch index.
def store_embeddings(embeddings, texts, meta_data, host, awsauth, index_name):
    docsearch = OpenSearchVectorSearch.from_embeddings(
        embeddings,
        texts,
        meta_data,
        opensearch_url=f'https://{host}:443',
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        index_name=index_name,
        bulk_size=1000
    )
    return docsearch
# Func to do both generating and storing embeddings
def generate_store_embeddings(bedrock_client, chunks, host, awsauth, index_name):
    embeddings_model = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=bedrock_client)
    docsearch = OpenSearchVectorSearch.from_documents(
        chunks,
        embeddings_model,
        opensearch_url=f'https://{host}:443',
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        index_name=index_name,
        bulk_size=1000
    )
    return docsearch
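# Example usage (illustrative): host is the OpenSearch Serverless endpoint without the
# scheme; the collection ID and index name are placeholders.
#   generate_store_embeddings(bedrock_client, chunks,
#                             '<collection-id>.us-east-1.aoss.amazonaws.com',
#                             awsauth, 'pdf-chunks')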
## Main function
def main(bucket_name, index_name, local_path):
    download_documents(bucket_name, local_path)
    # Load the documents
    loader = PyPDFDirectoryLoader(local_path)
    docs = loader.load()
    print('Start chunking')
    chunks = split_text(docs, 1000, 100)
    print(chunks[1])
    texts = [chunk.page_content for chunk in chunks]
    print('Start vectorizing')
    embeddings = generate_embeddings(texts)
    # Storage or further processing of the embeddings happens here
    print('Embeddings generated:', embeddings)
    print('End processing')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
    parser.add_argument("--bucket_name", help="The GCS bucket name where documents are stored")
    parser.add_argument("--index_name", help="The name of the index for storing embeddings (if applicable)")
    parser.add_argument("--local_path", help="Local path to store downloaded files")
    args = parser.parse_args()
    main(args.bucket_name, args.index_name, args.local_path)
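The helpers create_index and store_embeddings are still defined for the OpenSearch path, but main no longer calls them. A minimal sketch of how the Vertex AI embeddings could be pushed into OpenSearch follows; the helper name store_in_opensearch and the endpoint parameter are assumptions, and the index dimension (1536) would have to match the model deployed behind the Vertex AI endpoint.

def store_in_opensearch(chunks, embeddings, endpoint, index_name):
    # Assumes the awsauth signer defined at the top of the script.
    opensearch_client = OpenSearch(
        hosts=[{'host': endpoint, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )
    create_index(opensearch_client, index_name)
    texts = [chunk.page_content for chunk in chunks]
    # One metadata entry per chunk, with 1-based page numbers
    meta_data = [{'source': chunk.metadata['source'], 'page': chunk.metadata['page'] + 1}
                 for chunk in chunks]
    return store_embeddings(embeddings, texts, meta_data, endpoint, awsauth, index_name)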
File added