Skip to content
Snippets Groups Projects
Commit 64a49f18 authored by abir.chebbi's avatar abir.chebbi
Browse files

Adjust the readme

parent a5728da8
No related branches found
No related tags found
No related merge requests found
......@@ -24,7 +24,7 @@ def write_files(s3_client, directory, bucket):
Key=filename
)
print(f"{filename} uploaded successfully.")
def main(bucket_name, local_dir):
s3_client = boto3.client('s3')
create_bucket(s3_client, bucket_name)
......@@ -32,8 +32,8 @@ def main(bucket_name, local_dir):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Upload PDF files to an S3 bucket")
parser.add_argument("bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
parser.add_argument("LOCAL_DIR", help="The name of the folder to put the pdf files")
parser.add_argument("--bucket_name", help="The name of the S3 bucket to which the files will be uploaded")
parser.add_argument("--local_path", help="The name of the folder to put the pdf files")
args = parser.parse_args()
main(args.bucket_name, args.LOCAL_DIR)
main(args.bucket_name, args.local_path)
......@@ -152,7 +152,7 @@ def main(collection_name,IAM_USER):
if __name__== "__main__":
parser = argparse.ArgumentParser(description="Create collection")
parser.add_argument("collection_name", help="The name of the collection")
parser.add_argument("iam_user", help="The iam user")
parser.add_argument("--collection_name", help="The name of the collection")
parser.add_argument("--iam_user", help="The iam user")
args = parser.parse_args()
main(args.collection_name,args.iam_user)
......@@ -165,8 +165,8 @@ def main(bucket_name, endpoint,index_name):
if __name__== "__main__":
parser = argparse.ArgumentParser(description="Process PDF documents and store their embeddings.")
parser.add_argument("bucket_name", help="The S3 bucket name where documents are stored")
parser.add_argument("endpoint", help="The OpenSearch service endpoint")
parser.add_argument("index_name", help="The name of the OpenSearch index")
parser.add_argument("--bucket_name", help="The S3 bucket name where documents are stored")
parser.add_argument("--endpoint", help="The OpenSearch service endpoint")
parser.add_argument("--index_name", help="The name of the OpenSearch index")
args = parser.parse_args()
main(args.bucket_name, args.endpoint, args.index_name)
......@@ -7,8 +7,9 @@ from langchain.chains import RetrievalQA
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.chat_models import BedrockChat
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from langchain import PromptTemplate
import argparse
# Embeddings Client
bedrock_client = boto3.client(service_name="bedrock-runtime")
......@@ -24,26 +25,26 @@ st.set_page_config(
st.title("Chat with your lecture")
# AWS and OpenSearch Configuration
host = 'd7gvxdj7jpz3h3bj0xq6.us-east-1.aoss.amazonaws.com'
index_name = 'cloud_lecture'
awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
# OpenSearch Client
opensearch_client = OpenSearch(
hosts=[{'host': host, 'port': 443}],
def ospensearch_client(endpoint):
awsauth = AWSV4SignerAuth(boto3.Session().get_credentials(), 'us-east-1', 'aoss')
client = OpenSearch(
hosts=[{'host': endpoint, 'port': 443}],
http_auth=awsauth,
use_ssl=True,
verify_certs=True,
connection_class=RequestsHttpConnection,
)
)
return client
def get_embedding(question, bedrock_client):
    """Return the embedding of *question* from the Titan text-embedding model.

    Args:
        question: The text to embed.
        bedrock_client: Client object handed to ``BedrockEmbeddings`` as its
            ``client`` argument.

    Returns:
        Whatever ``BedrockEmbeddings.embed_query`` returns for *question*.
    """
    titan_embedder = BedrockEmbeddings(
        client=bedrock_client,
        model_id="amazon.titan-embed-text-v1",
    )
    return titan_embedder.embed_query(question)
def similarity_search(embed_query, index_name):
def similarity_search(client, embed_query, index_name):
query_body = {
"size": 5,
"query": {
......@@ -55,7 +56,7 @@ def similarity_search(embed_query, index_name):
}
}
}
response = opensearch_client.search(index=index_name, body=query_body)
response = client.search(index=index_name, body=query_body)
return response['hits']['hits']
def prepare_prompt(question, context):
......@@ -85,7 +86,9 @@ def generate_answer(prompt):
return answer
def main():
def main(endpoint, index_name):
oss_client= ospensearch_client(endpoint)
# initialize chat session in streamlit if not already present
if "chat_history" not in st.session_state:
......@@ -110,7 +113,7 @@ def main():
embed_question= get_embedding(user_prompt,bedrock_client)
print(embed_question)
sim_results = similarity_search(embed_question, index_name)
sim_results = similarity_search(oss_client, embed_question, index_name)
context = [i['_source']['text'] for i in sim_results]
print(context)
prompt = prepare_prompt(user_prompt, context)
......@@ -122,6 +125,11 @@ def main():
st.markdown(message["content"])
if __name__== "__main__":
main()
# Argument parsing
parser = argparse.ArgumentParser(description='Configure endpoint and index name for the lecture chat application.')
parser.add_argument('endpoint', type=str, help='The endpoint for the OpenSearch service.')
parser.add_argument('index_name', type=str, help='The index name for storing embeddings.')
args = parser.parse_args()
main(args.endpoint, args.index_name)
# chatbot-lab
## Set up environment
1. AWS CLI: Ensure AWS CLI is installed and configured on your laptop(refer to Session 1)
2. Ensure python is installed: python 3.8 or higher
2. Install required python libraries listed in the 'requirements.txt':
1. AWS CLI: Ensure the AWS CLI is installed and configured on your laptop (refer to the setup guide provided in Session 1).
2. Ensure Python 3.8 or higher is installed.
3. Install the required Python libraries listed in `requirements.txt`:
`pip3 install -r requirements.txt`
## Part 1:
### Step 1: Object storage Creation
Create an S3 bucket and upload a few PDF files (Detailed steps are provided in the first session).
Create an S3 bucket and upload a few PDF files by running:
`python create-S3-and-put-docs.py --bucket_name [YourBucketName] --local_path [PathToYourPDFFiles]`
Where:
`--bucket_name`: The name for the new S3 bucket to be created.
`--local_path`: The local directory path where the PDF files are stored.
### Step 2: Vector Store Creation
To set up the Vector Store, run the following command: `python create-vector-db.py`
Create a vector database for storing embeddings by running:
`python create-vector-db.py --collection_name [NameOfCollection] --iam_user [YourIAMUser]`
Where:
`--collection_name`: Name of the collection that you want to create to store embeddings.
`--iam_user`: The name of your IAM user. For example, for group 14 the IAM user is `master-group-14`.
This script performs the following actions:
* Set up the security policies: Sets up encryption, network, and data access policies for collections starting with "test".
* Vector Store Initialization: Creates a vector store named test1, specifically designed for vector search operations.
* Endpoint Retrieval: After the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.
* Sets up encryption, network, and data access policies for the collection.
* Creates a vector store named after the collection name passed as an argument.
* After the vector store is set up, the script retrieves and displays the store's endpoint for immediate use.
### Step 3: Vectorizing the PDF Files
After setting up the S3 bucket and Vector Store, prepare to vectorize the PDF files:
* In main.py, update the S3 bucket name to the one you created.
* Update the Vector Store endpoint with the one provided by the setup script.
* Execute the processing script: `python main.py`
After setting up the S3 bucket and the Vector Store, you can process the PDF files to generate embeddings and store them in the vector database.
Run:
`python main.py --bucket_name [YourBucketName] --endpoint [YourVectorDBEndpoint] --index_name [YourIndexName]`
Where:
`--bucket_name`: The name of the S3 bucket containing the PDF files.
`--endpoint`: Endpoint for the vector database.
`--index_name`: The name of the index in the collection in which the embeddings will be stored.
The main.py script will:
1. Download PDF files from the S3 bucket.
2. Split them into chunks.
3. Generate embeddings from the chunks.
4. Store these embeddings in the OpenSearch Vector DB.
4. Create an index in the vector DB.
5. Store these embeddings in the OpenSearch Vector DB.
## Part 2:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment